Example No. 1
def main():
    usage_str = 'Usage:\n' \
                '1. Dir for pairs rdd.\n' \
                '2. Dir for matrix output - should also contain the info_dict json with num of rows and cols.\n' \
                '3. Name of output matrix.'
    if (len(sys.argv) != 4):
        print(usage_str)
        return

    input_rdd_dir = sys.argv[1]
    output_dir = sys.argv[2]
    out_name = sys.argv[3]

    conf = SparkConf().set("spark.driver.maxResultSize", "30G"). \
        set("spark.hadoop.validateOutputSpecs", "false"). \
        set('spark.default.parallelism', '100')
    spark = SparkContext.getOrCreate(conf=conf)

    rdd = load_three_tuple_rdd(spark, input_rdd_dir)
    info_dict = load_dict(add_slash_to_dir(output_dir) + 'info_dict.json')
    n_rows = int(info_dict['rows'])
    n_cols = int(info_dict['cols'])
    result_mat = tuples_rdd_to_csr(rdd, (n_rows, n_cols))

    f1 = open(add_slash_to_dir(output_dir) + out_name + '_sparse_scipy.pickle',
              mode='wb')
    pickle.dump(result_mat, f1)
    f1.close()
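
load_three_tuple_rdd and tuples_rdd_to_csr above are project helpers whose code is not shown here. A minimal sketch of the conversion, assuming the RDD holds (row, col, value) triples that fit in driver memory:

import numpy as np
from scipy.sparse import coo_matrix

def tuples_rdd_to_csr_sketch(rdd, shape):
    # Collect the (row, col, value) triples and build the matrix in COO form,
    # then convert to CSR; duplicate positions are summed by scipy.
    rows, cols, vals = zip(*rdd.collect())
    return coo_matrix((np.array(vals), (np.array(rows), np.array(cols))),
                      shape=shape).tocsr()
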
def main():

    usage_str = 'Receives a matrix in rdd form and outputs a list of holdout pairs.\n' \
                'Args:\n' \
                '1. Dir of matrix in HDFS\n' \
                '2. Output dir (non-HDFS)\n' \
                '3. -b for both nonzeros and zeros, -nz for nonzeros only. (any other input will default to -nz)'

    if (len(sys.argv) != 4):
        print(usage_str)
        return

    hdfs_dir = sys.argv[1]
    output_dir = sys.argv[2]
    nonzeros_option = sys.argv[3]

    conf = SparkConf().set("spark.driver.maxResultSize", "4G").\
            set('spark.default.parallelism', '200')
    spark = SparkContext(conf=conf)
    test_rdd_matrix = load_count_vector_matrix(
        spark, hdfs_dir).map(lambda x: x[1]).zipWithIndex()
    if (nonzeros_option == '-b'):
        holdout_rdd = test_rdd_matrix.map(
            lambda x: holdout_row_both(x[0], x[1]))
    else:
        holdout_rdd = test_rdd_matrix.map(
            lambda x: holdout_row_nonzero(x[0], x[1]))

    holdout_pairs = holdout_rdd.reduce(lambda x, y: (x[0] + y[0], x[1] + y[1]))

    pickle.dump(
        holdout_pairs,
        open(add_slash_to_dir(output_dir) + 'holdout_pairs.pkl', mode='wb'))
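
The per-row holdout helpers (holdout_row_both, holdout_row_nonzero) are project functions not shown here; the reduce above simply concatenates their per-row outputs. A hypothetical sketch of holdout_row_nonzero, assuming each element is a (SparseVector, row_index) pair and the helper returns parallel lists of row indices and held-out column indices:

import random

def holdout_row_nonzero_sketch(sparse_row, row_index, n_holdout=1):
    # Pick up to n_holdout random nonzero columns of this row to hold out.
    nonzero_cols = list(sparse_row.indices)
    chosen = random.sample(nonzero_cols, min(n_holdout, len(nonzero_cols)))
    return ([row_index] * len(chosen), chosen)
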
def main():
    usage_str = 'Gets the name of the .csv file containing names and ids, and creates and saves the doc id to ' \
                'doc name dictionary.\n' \
                'Args:\n' \
                '1. Input file name.\n' \
                '2. Output dir.'

    if len(sys.argv) != 3:
        print(usage_str)
        return

    input_filename = sys.argv[1]
    output_dir = sys.argv[2]

    dict_result = get_id_name_dict(input_filename)
    print('Dict generated!')
    make_sure_path_exists(add_slash_to_dir(output_dir))

    with open(add_slash_to_dir(output_dir) + 'docid_docname.json', 'w') as f:
        json.dump(dict_result, f)
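
get_id_name_dict is a project helper; a plausible sketch under the assumption that the input is a two-column CSV of (doc id, doc name) rows (the real delimiter and column order may differ):

import csv

def get_id_name_dict_sketch(filename):
    # Map the first column (id) to the second column (name).
    with open(filename, newline='') as f:
        return {row[0]: row[1] for row in csv.reader(f)}
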
Example No. 4
def main():
    if (len(sys.argv) != 2):
        print(
            'Please give the input directory containing the histogram data (the output dir of create_doc_edit_histograms.py).'
        )
        return
    input_dir = sys.argv[1]
    doc_lengths = pickle.load(
        open(add_slash_to_dir(input_dir) + 'doc_lengths.pkl', mode='rb'))
    doc_edit_total = pickle.load(
        open(add_slash_to_dir(input_dir) + 'doc_edit_total.pkl', mode='rb'))
    doc_edit_distinct = pickle.load(
        open(add_slash_to_dir(input_dir) + 'doc_edit_distinct.pkl', mode='rb'))
    user_edit_total = pickle.load(
        open(add_slash_to_dir(input_dir) + 'user_edit_total.pkl', mode='rb'))
    doc_keep_list = [
        x for x in doc_lengths if doc_lengths[x] >= DOC_STUB_THRESH
    ]
    user_keep_list = [x for x in user_edit_total]
    print('Doc total edit average after stub removal')
    print(sum([doc_edit_total[x] for x in doc_keep_list]) / len(doc_keep_list))
    print('Doc distinct edit average after stub removal')
    print(
        sum([doc_edit_distinct[x]
             for x in doc_keep_list]) / len(doc_keep_list))
    print('Length of doc keep list:')
    print(len(doc_keep_list))
    print('Length of user keep list:')
    print(len(user_keep_list))
    pickle.dump(
        user_keep_list,
        open(add_slash_to_dir(input_dir) + 'user_keep_list.pkl', mode='wb'))
    pickle.dump(
        doc_keep_list,
        open(add_slash_to_dir(input_dir) + 'doc_keep_list.pkl', mode='wb'))
Example No. 5
def main():
    usage_str = 'Creates a user-doc count-vectorised matrix out of revision history data. Can do a train-test split. ' \
                'Has two filtering schemes, which are determined by the first argument. If first arg is -old, the ' \
                'old approach is used, which is deprecated and will be removed. If it is -new, the new approach ' \
                'is used. For -old, the args are:\n' \
                '\t1. The input file.\n' \
                '\t2. The directory for the RDD output.\n' \
                '\t3. The directory for the json dictionary and test id list (if to be saved).\n' \
                '\t4. The name of the non-admin pages file.\n' \
                '\t5. Name of bot names file to filter them out, -none for no such filtering\n' \
                '\t6. Name of list of docs to discard, -none for no such thing\n' \
                '\t7. Train/test split fraction. Must be a float in the range [0,1). If it\'s 0, no split is performed.\n' \
                '\t8. If it is -f then inactive users are filtered, settings in constants.py. -nf for no filtering.\n' \
                'For -new, the args are:\n' \
                '\t1. The input file.\n' \
                '\t2. The directory for the RDD output.\n' \
                '\t3. The directory for the json dictionary and test id list (if to be saved).\n' \
                '\t4. Name of doc keep-list. -none to keep all.\n' \
                '\t5. Name of user keep-list. -none to keep all.\n' \
                '\t6. Train/test split fraction. Must be a float in the range [0,1). If 0, no split is performed.\n' \
                'In this case, user and doc lower thresholds and the doc upper threshold are set in constants.py.'
    if (len(sys.argv) < 2):
        print(usage_str)
        return
    if (sys.argv[1] == '-old'):
        old_mode = True
        if len(sys.argv) != 10:
            print(usage_str)
            return

        filter_inactives = False

        input_filename = sys.argv[2]
        output_dir_rdd = sys.argv[3]
        output_dir_dict = sys.argv[4]
        nonadmin_pages_filename = sys.argv[5]
        bots_filename = sys.argv[6]
        discard_list_filename = sys.argv[7]
        if (bots_filename == '-none'):
            bots_filename = None
        if (discard_list_filename == '-none'):
            discard_list_filename = None
        if (nonadmin_pages_filename == '-none'):
            nonadmin_pages_filename = None
        split_frac = 0
        try:
            split_frac = float(sys.argv[8])
            if (split_frac > 1 or split_frac < 0):
                print(usage_str)
                return
        except ValueError:
            print(usage_str)
            return

        if (sys.argv[9] == '-f'):
            filter_inactives = True
        elif (sys.argv[9] == '-nf'):
            filter_inactives = False
        else:
            print(usage_str)
            return
    else:
        # TODO Add an option for getting an edit count matrix vs getting an edit size matrix.
        # Edit count option just uses the existing code, while edit size matrix uses the new code.
        old_mode = False
        if len(sys.argv) != 8:
            print(usage_str)
            return
        input_filename = sys.argv[2]
        output_dir_rdd = sys.argv[3]
        output_dir_dict = sys.argv[4]
        doc_keep_list = sys.argv[5]
        user_keep_list = sys.argv[6]
        if (doc_keep_list == '-none'):
            doc_keep_list = None
        if (user_keep_list == '-none'):
            user_keep_list = None
        split_frac = 0
        try:
            split_frac = float(sys.argv[7])
            if (split_frac > 1 or split_frac < 0):
                print(usage_str)
                return
        except ValueError:
            print(usage_str)
            return

    spark = SparkContext.getOrCreate()
    input_rdd = tsv_to_rdd(spark, input_filename)

    if (old_mode):
        filtered_rdd = filter_user_doc_data(
            spark,
            input_rdd,
            pages_filename=nonadmin_pages_filename,
            user_freq_filtration=filter_inactives,
            bots_filename=bots_filename,
            doc_discard_list_filename=discard_list_filename)
    else:
        filtered_rdd = iterative_keeplist_filtering(
            spark,
            input_rdd,
            user_keep_list_filename=user_keep_list,
            doc_keep_list_filename=doc_keep_list)
    print('**********************Filtering complete************************')
    col_dict, count_vec_matrix = create_user_doc_count(spark,
                                                       filtered_rdd,
                                                       removeAnonymous=True)
    print(
        '******************Data matrix creation completed!*************************'
    )

    train_ids_list = None
    test_ids_list = None
    if (split_frac != 0):
        train_ids_list, test_ids_list = train_test_split_ids(
            spark, count_vec_matrix, test_frac=split_frac, id_col=0)

    save_dict(output_dir_dict, col_dict)

    if (split_frac == 0):
        save_rdd_mat(output_dir_rdd, count_vec_matrix)
    else:
        train_count_vec = get_sub_rdd_by_id_list(spark, count_vec_matrix,
                                                 train_ids_list)
        test_count_vec = get_sub_rdd_by_id_list(spark, count_vec_matrix,
                                                test_ids_list)
        save_rdd_mat(
            add_slash_to_dir(output_dir_rdd) + 'train', train_count_vec)
        save_rdd_mat(add_slash_to_dir(output_dir_rdd) + 'test', test_count_vec)

    if (test_ids_list is not None):
        save_id_list([int(test_id) for test_id in test_ids_list],
                     output_dir_dict)

    n_rows = count_vec_matrix.count()
    n_cols = count_vec_matrix.first()[1].size

    info_dict = {
        'rows': n_rows,
        'cols': n_cols,
        'split fraction': split_frac,
        'doc_filter_thresh': DOC_INACTIVITY_THRESH,
        'user_inactivity_thresh': USER_INACTIVITY_THRESH,
        'user_bot_thresh': USER_BOT_THRESH,
        'doc_stub_thresh': DOC_STUB_THRESH
    }
    save_dict(output_dir_dict, info_dict, 'info_dict.json')

    print('Total number of rows:\n' + str(n_rows) +
          '\nTotal number of cols:\n' + str(n_cols))
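
train_test_split_ids, get_sub_rdd_by_id_list and save_rdd_mat are project helpers. A minimal sketch of the id split, assuming it samples a fraction of the distinct ids found in column id_col of each record for the test set and keeps the rest for training:

def train_test_split_ids_sketch(spark, rdd, test_frac, id_col=0):
    # spark is accepted to mirror the call above but is not needed here.
    ids = rdd.map(lambda x: x[id_col]).distinct()
    test_ids = set(ids.sample(withReplacement=False, fraction=test_frac).collect())
    train_ids = [i for i in ids.collect() if i not in test_ids]
    return train_ids, list(test_ids)
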
Example No. 6
def main():
    usage_str = 'Gets a scipy sparse matrix in pickle form, calculates its top k singular vectors ' \
                '(w/o mean-centering).\n' \
                'Args:\n' \
                '1. Input file dir\n' \
                '2. Number of singular vectors desired (max 1000, default 20)\n' \
                '3. -w for inverse row freq weighting, -n for normal\n' \
                '4. List of rows to keep (-none for none)\n' \
                '5. -b to binarise the input, -nb to leave it be.\n' \
                '6. Name of the input file.\n' \
                '7. Whether or not to transpose the original matrix. -t to transpose, -n otherwise.'
    if (len(sys.argv) != 8):
        print(usage_str)
        return

    input_dir = sys.argv[1]
    k = 20
    try:
        k = int(sys.argv[2])
    except ValueError:
        k = 20

    if (k > 1000 or k < 1):
        k = 20
    do_weighting = False
    if (sys.argv[3] == '-w'):
        do_weighting = True
    elif (sys.argv[3] == '-n'):
        do_weighting = False
    else:
        print(usage_str)
        return
    keep_users_filename = sys.argv[4]
    binarise = False
    if (sys.argv[5] == '-b'):
        binarise = True
    elif (sys.argv[5] == '-nb'):
        binarise = False
    else:
        print(usage_str)
        return
    if (keep_users_filename != '-none'):
        keep_rows = np.array(pickle.load(open(keep_users_filename, mode='rb')))
    else:
        keep_rows = None
    input_filename = sys.argv[6]
    transpose_input = False
    if sys.argv[7] == '-t':
        transpose_input = True
    elif sys.argv[7] == '-n':
        transpose_input = False
    else:
        print(usage_str)
        return

    in_file = open(add_slash_to_dir(input_dir) + input_filename, mode='rb')
    X = pickle.load(in_file)
    if (keep_rows is not None):
        X = X[keep_rows, :]
    if transpose_input:
        X = X.transpose()
    if (binarise):
        X = csr_matrix((np.array([1] * len(X.data)), X.indices, X.indptr),
                       shape=X.shape)
    if (do_weighting):
        X = inv_row_freq_weighting(X)

    singular_vecs = get_singular_vecs(X, k)
    f1 = open(add_slash_to_dir(input_dir) + 'v_sing_vecs.pkl', mode='wb')
    pickle.dump(singular_vecs, f1)
    f1.close()
    print('V singular vecs saved')
    f2 = open(add_slash_to_dir(input_dir) + 'utimessigma_sing_vecs.pkl',
              mode='wb')
    utimessigma = X.dot(singular_vecs[:, 1:])
    pickle.dump(utimessigma, f2)
    f2.close()
    print('U*Sigma saved')
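
get_singular_vecs is a project helper; assuming it returns the top-k right singular vectors as columns (so that X.dot(...) above yields U*Sigma), it could be sketched with scipy's sparse SVD:

import numpy as np
from scipy.sparse.linalg import svds

def get_singular_vecs_sketch(X, k):
    # svds returns singular values in ascending order; reorder so the
    # largest singular values come first.
    u, s, vt = svds(X.asfptype(), k=k)
    order = np.argsort(s)[::-1]
    return vt[order, :].T  # shape (n_cols, k): right singular vectors as columns
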
Example No. 7
def main():
    usage_str = '1. Concept-doc file name, -none if there isn\'t any.\n' \
                '2. Concept-word-count file name\n' \
                '3. Output rdd dir\n' \
                '4. Dir for doc filtering dict, -none if not used\n' \
                '5. Dir for output dict'

    if (len(sys.argv) != 6):
        print(usage_str)
        return
    concept_doc_filename = sys.argv[1]
    concept_term_filename = sys.argv[2]
    output_dir_rdd = sys.argv[3]
    filter_dict_dir = sys.argv[4]
    output_dir_dict = sys.argv[5]

    conf = SparkConf().set("spark.driver.maxResultSize", "2G").\
        set("spark.hadoop.validateOutputSpecs", "false").\
        set('spark.default.parallelism', '100')
    spark = SparkContext.getOrCreate(conf=conf)

    # Build the concept-to-doc map (if a concept-doc file is given) so that concept ids in the concept-term rdd can be mapped to doc ids.

    c2dmap_bc = None
    if concept_doc_filename != '-none':
        rdd1 = tsv_to_rdd(spark, concept_doc_filename)
        concept_to_doc_map = generate_concept_doc_dict(rdd1)
        c2dmap_bc = spark.broadcast(concept_to_doc_map)

    #Loading the concept-term rdd and mapping the concepts to docs.

    rdd2 = tsv_to_rdd(spark, concept_term_filename)
    rdd2 = rdd2.map(lambda x: (int(x[0]), (int(x[1]), int(x[2]))))

    if c2dmap_bc is not None:
        rdd2 = rdd2.map(lambda x: (c2dmap_bc.value[x[0]], x[1]))

    doc_row_dict = None

    #Here we filter the docs if a dict of ids and indices of docs we want to keep is given as input
    # (e.g. when docs are already columns of a user-doc matrix).
    if (filter_dict_dir != '-none'):
        filtering_dict = intify_dict(load_dict(add_slash_to_dir(filter_dict_dir) + 'col_dict.json'))
        filter_dict_bc = spark.broadcast(filtering_dict)
        rdd2 = rdd2.filter(lambda x: x[0] in filter_dict_bc.value)
        rdd2 = rdd2.map(lambda x: (filter_dict_bc.value[x[0]], x[1]))
        # Important: when filtering with a dict, the number of docs here must equal the number of docs in that dict (for matrix multiplication).
        docs_num = len(filtering_dict)
    else:
        doc_row_dict = rdd2.map(lambda x: x[0]).distinct().zipWithIndex().collectAsMap()
        doc_row_dict_bc = spark.broadcast(doc_row_dict)
        rdd2 = rdd2.map(lambda x: (doc_row_dict_bc.value[x[0]], x[1]))
        docs_num = len(doc_row_dict)


    # Now we compute each word's document frequency. This will be used both for filtering and for calculating IDF.
    word_doc_freq = rdd2.map(lambda x: (x[1][0], 1)).reduceByKey(add).collectAsMap()
    #Filtering now
    doc_freq_bc = spark.broadcast(word_doc_freq)
    rdd2 = rdd2.filter(lambda x: doc_freq_bc.value[x[1][0]] < DOC_FREQ_UPPER_REL * docs_num and
                                 doc_freq_bc.value[x[1][0]] > DOC_FREQ_LOWER_ABS)

    # Now we compute TF-IDF, so instead of doc-word-count triples we have doc-word-tf_idf triples.
    # Entries with a TF-IDF of 0 are also filtered out.

    tf_idf_pairs = rdd2.map(lambda x: (x[0], x[1][0], np.log(1+x[1][1]) * np.log(docs_num / doc_freq_bc.value[x[1][0]])))
    tf_idf_pairs = tf_idf_pairs.filter(lambda x: x[2] > 0)

    # Now we map the word ids of the rdd to word indices, which are the column indices of the doc-term matrix.

    word_index_dict = tf_idf_pairs.map(lambda x: x[1]).distinct().zipWithIndex().collectAsMap()
    word_index_d_bc = spark.broadcast(word_index_dict)
    tf_idf_pairs = tf_idf_pairs.map(lambda x: (x[0], word_index_d_bc.value[x[1]], x[2]))

    words_num = len(word_index_dict)
    total_count = tf_idf_pairs.count()

    #count_vec_matrix = tf_idf_pairs.reduceByKey(add).map(lambda x: (x[0], SparseVector(words_num, {a[0]:a[1] for a in x[1]})))

    #Now saving.
    tf_idf_pairs.saveAsTextFile(add_slash_to_dir(output_dir_rdd))
    print('************************Saved RDD, now saving dicts*****************************')
    info_dict = {'rows': docs_num, 'cols': words_num, 'vals': total_count, 'upper_df_thresh':DOC_FREQ_UPPER_REL,
                 'lower_df_thresh':DOC_FREQ_LOWER_ABS}
    save_dict(output_dir_dict, info_dict, 'info_dict.json')
    save_dict(output_dir_dict, word_index_dict)
    if (doc_row_dict is not None):
        save_dict(output_dir_dict, doc_row_dict, 'row_dict.json')
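
For reference, the weighting applied in the tf_idf_pairs map above, in isolation: each (doc, word, count) entry gets the weight log(1 + count) * log(n_docs / doc_freq), so terms that occur in almost every doc are driven towards zero.

import numpy as np

def tf_idf_weight(count, doc_freq, n_docs):
    # Log-scaled term frequency times inverse document frequency.
    return np.log(1 + count) * np.log(n_docs / doc_freq)

# e.g. count=3, doc_freq=100, n_docs=10000 -> log(4) * log(100) ~= 6.38
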
def main():
    usage_str = 'Takes an rdd matrix, converts it into a csr matrix. The rdd matrix can either be (id, SparseVector)' \
                ', in which case we use indices instead of the ids, or already indexed as (index, SparseVector), and ' \
                'in the latter case there is also the option of filtering the rows using a dictionary ' \
                '(this is for the doc-term matrix), which maps ' \
                'the ids to indices. We also save the row dict.\n' \
                '1. rdd dir name\n' \
                '2. output dir for matrix\n' \
                '3. -b for binarise, -n otherwise\n' \
                '4. name for the output file\n' \
                '5. dir for doc filtering dict, optional, -invert for special case of already indexed input rdd'

    if (len(sys.argv) < 5 or len(sys.argv) > 6):
        print(usage_str)
        return

    input_rdd_dir = sys.argv[1]
    output_dir = sys.argv[2]
    to_bin = True

    if (sys.argv[3] == '-n'):
        to_bin = False
    elif (sys.argv[3] == '-b'):
        to_bin = True
    else:
        print(usage_str)
        return

    out_name = sys.argv[4]

    input_dict_dir = None
    already_indexed = False
    invert_rdd = False
    if (len(sys.argv) == 6):
        input_dict_dir = sys.argv[5]
        if input_dict_dir == '-invert':
            already_indexed = True
            invert_rdd = True
            input_dict_dir = None


    conf = SparkConf().set("spark.driver.maxResultSize", "30G").\
        set("spark.hadoop.validateOutputSpecs", "false").\
        set('spark.default.parallelism', '100')
    spark = SparkContext.getOrCreate(conf=conf)

    count_vec_matrix = load_count_vector_matrix(spark, add_slash_to_dir(input_rdd_dir)+'count_vector_matrix')
    if (to_bin):
        count_vec_matrix = binarise_rdd(count_vec_matrix)

    if (input_dict_dir is not None):
        in_dict = intify_dict(load_dict(add_slash_to_dir(input_dict_dir)+'col_dict.json'))
        dict_broadcast = spark.broadcast(in_dict)
        count_vec_matrix = count_vec_matrix.filter(lambda x: x[0] in dict_broadcast.value)
        count_vec_matrix = count_vec_matrix.map(lambda x: (x[1], dict_broadcast.value[x[0]]))
        already_indexed = True
        cols_num = count_vec_matrix.first()[0].size
    else:
        cols_num = count_vec_matrix.first()[1].size
        if (invert_rdd):
            count_vec_matrix = count_vec_matrix.map(lambda x: (x[1], x[0]))

    rows_num = count_vec_matrix.count()
    print('shape calculated!')
    result = mat_rdd_to_csr(count_vec_matrix, (rows_num, cols_num), already_indexed=already_indexed)

    row_dict = count_vec_matrix.map(lambda x:x[0]).zipWithIndex().collectAsMap()

    json.dump(row_dict, open(add_slash_to_dir(output_dir)+out_name+'_row_dict.json', mode='w'))

    f1 = open(add_slash_to_dir(output_dir)+out_name+'_sparse_scipy.pickle', mode='wb')
    pickle.dump(result, f1)
    f1.close()
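
binarise_rdd and mat_rdd_to_csr are project helpers. A sketch of the binarisation step, assuming rows are (key, SparseVector) pairs and using pyspark.mllib.linalg (the project may use the ml variant): keep each row's sparsity pattern and set every stored value to 1.

from pyspark.mllib.linalg import SparseVector

def binarise_rdd_sketch(rdd):
    return rdd.map(lambda x: (x[0],
                              SparseVector(x[1].size, x[1].indices,
                                           [1.0] * len(x[1].indices))))
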
def main():
    usage_str = 'This script produces doc and user edit counts (total and distinct) and doc sizes, using the ' \
                'following steps:\n' \
                '\t* Removing bots.\n' \
                '\t* Removing administrative pages.\n' \
                '\t* Calculating the doc lengths by counting the number of their words.\n' \
                '\t* Calculating total and distinct edit counts for users and docs.\n' \
                'Args:\n' \
                '1. The input revision history file.\n' \
                '2. The directory for the output files (all are dicts).\n' \
                '3. The name of the non-admin pages file.\n' \
                '4. Name of bot names file to filter them out.\n' \
                '5. Concept-doc file name, -none if nonexistent.\n' \
                '6. Concept-word-count file name\n' \
                '7. Whether to filter out by name or not. -f filters, -n does not.'
    if (len(sys.argv) != 8):
        print(usage_str)
        return
    input_filename = sys.argv[1]
    output_dir = sys.argv[2]
    nonadmin_pages_filename = sys.argv[3]
    bots_filename = sys.argv[4]
    concept_doc_filename = sys.argv[5]
    concept_term_filename = sys.argv[6]
    filter_by_name = False
    if (sys.argv[7] == '-f'):
        filter_by_name = True
    elif (sys.argv[7] == '-n'):
        filter_by_name = False
    else:
        print(usage_str)
        return
    conf = SparkConf().set("spark.driver.maxResultSize",
                           "10G").set('spark.default.parallelism', '100')
    spark = SparkContext.getOrCreate(conf=conf)
    input_rdd = tsv_to_rdd(spark, input_filename)

    filtered_rdd = filter_user_doc_data(spark,
                                        input_rdd,
                                        pages_filename=nonadmin_pages_filename,
                                        admin_filtration=True,
                                        bots_filename=bots_filename,
                                        doc_discard_list_filename=None,
                                        user_discard_list_filename=None,
                                        user_freq_filtration=False,
                                        doc_freq_filtration=False,
                                        discard_by_name=filter_by_name)
    remaining_docs = filtered_rdd.map(lambda x:
                                      (x[1], 1)).distinct().collectAsMap()

    c2dmap_bc = None
    if concept_doc_filename != '-none':
        rdd1 = tsv_to_rdd(spark, concept_doc_filename)
        concept_to_doc_map = generate_concept_doc_dict(rdd1)
        c2dmap_bc = spark.broadcast(concept_to_doc_map)

    remaining_docs_bc = spark.broadcast(remaining_docs)
    rdd2 = tsv_to_rdd(spark, concept_term_filename)
    rdd2 = rdd2.map(lambda x: (int(x[0]), int(x[2])))

    if c2dmap_bc is not None:
        rdd2 = rdd2.map(lambda x: (c2dmap_bc.value[x[0]], x[1]))

    rdd2 = rdd2.filter(lambda x: x[0] in remaining_docs_bc.value)

    doc_lengths = rdd2.reduceByKey(add).collectAsMap()
    doc_edit_total = filtered_rdd.map(lambda x: (x[1], 1)).reduceByKey(
        add).collectAsMap()
    doc_edit_distinct = filtered_rdd.distinct().map(
        lambda x: (x[1], 1)).reduceByKey(add).collectAsMap()
    user_edit_total = filtered_rdd.map(lambda x: (x[0], 1)).reduceByKey(
        add).collectAsMap()
    user_edit_distinct = filtered_rdd.distinct().map(
        lambda x: (x[0], 1)).reduceByKey(add).collectAsMap()

    print('Doc total edit average')
    print(
        sum([doc_edit_total[x] for x in doc_edit_total]) / len(doc_edit_total))
    print('Doc distinct edit average')
    print(
        sum([doc_edit_distinct[x]
             for x in doc_edit_distinct]) / len(doc_edit_distinct))
    print('User total edit average')
    print(
        sum([user_edit_total[x]
             for x in user_edit_total]) / len(user_edit_total))
    print('User distinct edit average')
    print(
        sum([user_edit_distinct[x]
             for x in user_edit_distinct]) / len(user_edit_distinct))
    print('Doc length average')
    print(sum([doc_lengths[x] for x in doc_lengths]) / len(doc_lengths))

    pickle.dump(
        doc_lengths,
        open(add_slash_to_dir(output_dir) + 'doc_lengths.pkl', mode='wb'))
    pickle.dump(
        doc_edit_total,
        open(add_slash_to_dir(output_dir) + 'doc_edit_total.pkl', mode='wb'))
    pickle.dump(
        doc_edit_distinct,
        open(add_slash_to_dir(output_dir) + 'doc_edit_distinct.pkl',
             mode='wb'))
    pickle.dump(
        user_edit_total,
        open(add_slash_to_dir(output_dir) + 'user_edit_total.pkl', mode='wb'))
    pickle.dump(
        user_edit_distinct,
        open(add_slash_to_dir(output_dir) + 'user_edit_distinct.pkl',
             mode='wb'))
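
The counting idiom used above, isolated on an RDD of (user, doc) rows: total edits per doc count every row, while distinct edits count each (user, doc) pair only once.

from operator import add

def doc_edit_counts(pairs_rdd):
    total = pairs_rdd.map(lambda x: (x[1], 1)).reduceByKey(add).collectAsMap()
    distinct = pairs_rdd.distinct().map(lambda x: (x[1], 1)) \
                        .reduceByKey(add).collectAsMap()
    return total, distinct
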
def main():
    usage_str = 'Performs offline testing of CF recommendations using user edit histories.\n' \
                'ATTENTION: The library "implicit" used in this script requires Python 3. Do not attempt running ' \
                'with Python 2.\n' \
                'Args:\n' \
                '1. Training user-doc matrix.\n' \
                '2. Test user-doc matrix.\n' \
                '3. Holdout pairs.\n' \
                '4. Output dir'

    if len(sys.argv) != 5:
        print(usage_str)
        return

    train_ud_filename = sys.argv[1]
    test_ud_filename = sys.argv[2]
    holdout_filename = sys.argv[3]
    output_dir = sys.argv[4]

    at_ks = [20, 50, 100, 200, 300]

    E_train = pickle.load(open(train_ud_filename, 'rb'), encoding='latin1')
    E_test = pickle.load(open(test_ud_filename, mode='rb'), encoding='latin1')
    heldout_pairs = pickle.load(open(holdout_filename, mode='rb'),
                                encoding='latin1')

    test_users, test_docs = heldout_pairs
    E_test_modified = erase_heldout(E_test, heldout_pairs)

    print('Data loaded, starting creation of training matrix...')

    n_train_users = E_train.shape[0]
    training_mat = vstack([E_train, E_test_modified]).transpose().tocsr()

    print('Starting training...')

    model = implicit.als.AlternatingLeastSquares(factors=50)
    model.fit(training_mat)

    max_at_k = max(at_ks)
    recommended_pairs = []
    user_counter = 0

    recommendation_test_mat = training_mat.transpose().tocsr()

    print('Calculating recommendations')

    for user_index in np.unique(test_users):
        user_counter += 1
        if user_counter % 100 == 0:
            print(user_counter)

        nonzero_indices = set(E_test_modified[user_index, :].nonzero()[1])
        user_index_in_training_mat = n_train_users + user_index
        article_index_ranking = model.recommend(
            user_index_in_training_mat,
            recommendation_test_mat,
            N=max_at_k,
            filter_already_liked_items=True)
        article_index_ranking = [x[0] for x in article_index_ranking]
        new_recommended_pairs = rankings_to_recommendation_tuples(
            article_index_ranking,
            max_at_k,
            user_index,
            E_test,
            nonzero_indices,
            ascending=False)
        recommended_pairs.extend(new_recommended_pairs)

    result_dict = py3_recoms_to_prec_recall(recommended_pairs, at_ks)
    make_sure_path_exists(add_slash_to_dir(output_dir))

    output_text = open(add_slash_to_dir(output_dir) + 'prec_and_recall_cf.txt',
                       mode='w')

    save_textual_desc_prec_and_recall(at_ks, output_text, result_dict)

    pickle.dump(
        result_dict,
        open(add_slash_to_dir(output_dir) + 'out_dict_cf.pkl', mode='wb'))
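
rankings_to_recommendation_tuples and py3_recoms_to_prec_recall are project helpers not shown here. For reference, the standard per-user precision@k and recall@k that such an evaluation typically aggregates (a sketch, not the project's implementation):

def precision_recall_at_k(ranked_doc_ids, heldout_doc_ids, k):
    top_k = set(ranked_doc_ids[:k])
    hits = len(top_k & set(heldout_doc_ids))
    precision = hits / float(k)
    recall = hits / float(len(heldout_doc_ids)) if heldout_doc_ids else 0.0
    return precision, recall
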
Example No. 11
def main():
    usage_str = 'Takes the test user-doc matrix, heldout pairs (a tuple of two lists) and the doc latent matrix, ' \
                'calculates answers to questionnaire based on non-heldout data for test users, and evaluates on ' \
                'the heldout pairs, with precision@k and recall@k.\n' \
                'Args:\n' \
                '1. Test user-doc matrix\n' \
                '2. Heldout pairs\n' \
                '3. Doc latent matrix\n' \
                '4. Output dir\n' \
                '5. Number of questions to consider.\n' \
                '6. Whether to use stratification: -s to stratify, -n otherwise.'

    if (len(sys.argv) != 7):
        print(usage_str)
        return

    input_matrix_name = sys.argv[1]
    heldout_pairs_name = sys.argv[2]
    doc_latent_name = sys.argv[3]
    #question_filename = sys.argv[4]
    output_dir = sys.argv[4]
    holdout_option = '-nz'
    n_q = int(sys.argv[5])
    stratify = True
    if sys.argv[6] == '-s':
        stratify = True
    elif sys.argv[6] == '-n':
        stratify = False
    else:
        print(usage_str)
        return

    E_test = pickle.load(open(input_matrix_name, mode='rb'))
    print(E_test.shape)
    heldout_pairs = pickle.load(open(heldout_pairs_name, mode='rb'))
    print('Number of test users:')
    print(np.unique(heldout_pairs[0]).size)
    doc_latent = pickle.load(open(doc_latent_name, mode='rb'))
    print(doc_latent.shape)

    if (n_q > doc_latent.shape[1]):
        n_q = doc_latent.shape[1]

    doc_latent_d = n_q
    doc_latent = doc_latent[:, :doc_latent_d]

    at_ks = [20, 50, 100, 200, 300]
    levels = 7
    result_dict = dotproduct_test_article_space(
        E_test,
        heldout_pairs,
        doc_latent,
        n_q=n_q,
        at_ks=at_ks,
        user_holdout_size=DEFAULT_HOLDOUT_SIZE,
        levels=levels,
        col_normalise=False,
        stratify=stratify)

    make_sure_path_exists(add_slash_to_dir(output_dir))
    output_filename = 'q_based_out_dict' + holdout_option
    if stratify:
        output_filename = output_filename + '_stratified.pkl'
    else:
        output_filename = output_filename + '_unstratified.pkl'
    pickle.dump(
        result_dict,
        open(add_slash_to_dir(output_dir) + output_filename, mode='wb'))

    output_text = open(add_slash_to_dir(output_dir) + 'prec_and_recall' +
                       holdout_option + '_' + str(levels) + '_' + str(n_q) +
                       '_.txt',
                       mode='w')
    save_textual_desc_prec_and_recall(at_ks, output_text, result_dict)
def main():
    usage_str = 'This script receives a user-doc edit matrix (non-binary) and a doc-term latent representation matrix. ' \
                'It computes two matrices, a latent user matrix and a latent doc matrix, both in the same latent ' \
                'space as the input doc-term matrix.\n' \
                'Running modes are -b for batch and -i for individual. ' \
                'If first arg is -i, then the rest of the args are:\n' \
                '1. Dir and name of the user-doc matrix.\n' \
                '2. Dir and name of the doc-term latent matrix.\n' \
                '3. Output directory.\n' \
                '4. Number of latent dimensions.\n' \
                '5. Name of file containing list of user indices to keep. Use -none if you don\'t want any such filtering.\n' \
                '6. User matrix init mode: -p for (0,1), -s for (-1,1) (both will then have their rows normalised)\n' \
                '7. -f for full training, -v for validation set separation (saves a file for the errors too)\n' \
                '8. alpha and lambda and theta in the format alpha,lambda,theta, e.g. 1,1e-3,1e-4\n' \
                '\nIf the first arg is -b, you should give the address of a file that contains the arguments ' \
                'listed above (each in one line).'
    if (len(sys.argv) < 2):
        print(usage_str)
        return
    mode_arg = sys.argv[1]
    if (mode_arg == '-i'):
        if (len(sys.argv) != 10):
            print(usage_str)
            return
        input_user_doc = sys.argv[2]
        input_doc_term = sys.argv[3]
        output_dir = sys.argv[4]
        n_latent = sys.argv[5]
        user_filter_filename = sys.argv[6]
        symmetry_arg = sys.argv[7]
        validation_arg = sys.argv[8]
        alphalambda = sys.argv[9]
    elif (mode_arg == '-b'):
        if (len(sys.argv) != 3):
            print(usage_str)
            return
        args_filename = sys.argv[2]
        args_file = open(args_filename, mode='r')
        arg_contents = args_file.readlines()
        arg_contents = [x.strip() for x in arg_contents]
        arg_contents = [x for x in arg_contents if len(x) > 0]
        if (len(arg_contents) != 8):
            print(usage_str)
            return
        input_user_doc = arg_contents[0]
        input_doc_term = arg_contents[1]
        output_dir = arg_contents[2]
        n_latent = arg_contents[3]
        user_filter_filename = arg_contents[4]
        symmetry_arg = arg_contents[5]
        validation_arg = arg_contents[6]
        alphalambda = arg_contents[7]
    else:
        print(usage_str)
        return

    try:
        n_latent = int(n_latent)
    except ValueError:
        print(usage_str)
        return

    user_init_symmetric = False
    if (symmetry_arg == '-p'):
        user_init_symmetric = False
    elif (symmetry_arg == '-s'):
        user_init_symmetric = True
    else:
        print(usage_str)
        return

    do_validation = False
    if (validation_arg == '-f'):
        do_validation = False
    elif (validation_arg == '-v'):
        do_validation = True
    else:
        print(usage_str)
        return

    split_alphalambda = alphalambda.strip('(').strip(')').split(',')
    gamma_ = -1
    if (len(split_alphalambda) < 3 or len(split_alphalambda) > 4):
        print(usage_str)
        return
    try:
        alpha_ = float(split_alphalambda[0])
        lambda_ = float(split_alphalambda[1])
        theta_ = float(split_alphalambda[2])
        if (len(split_alphalambda) == 4):
            gamma_ = float(split_alphalambda[3])
    except ValueError:
        print(usage_str)
        return
    # Loading the input matrices

    print('Loading...')
    ud_in_file = open(input_user_doc, mode='rb')
    user_doc_sparse_mat_original = csr_matrix(pickle.load(ud_in_file))
    ud_in_file.close()

    if (user_filter_filename != '-none'):
        user_filter_list = pickle.load(open(user_filter_filename, mode='rb'))
        user_doc_sparse_mat_original = user_doc_sparse_mat_original[
            user_filter_list, :]

    validation_pairs = None
    if (do_validation):
        user_doc_sparse_mat, validation_pairs = do_validation_split(
            user_doc_sparse_mat_original)
    else:
        user_doc_sparse_mat = user_doc_sparse_mat_original

    print('User-doc matrix loaded')
    dt_in_file = open(input_doc_term, mode='rb')
    doc_original_latent = np.array(pickle.load(dt_in_file))
    doc_original_latent -= np.mean(doc_original_latent, axis=0)
    #doc_original_latent = doc_original_latent / np.linalg.norm(doc_original_latent, axis=0)
    dt_in_file.close()
    print('Loading completed')

    user_doc_nonzero_indices = user_doc_sparse_mat.nonzero()
    n_nonzeros = len(user_doc_nonzero_indices[0])

    if n_latent < doc_original_latent.shape[1] and n_latent > 0:
        doc_original_latent = doc_original_latent[:, 0:n_latent]
    else:
        n_latent = doc_original_latent.shape[1]

    n_users = user_doc_sparse_mat.shape[0]
    n_docs = user_doc_sparse_mat.shape[1]

    doc_original_latent /= (
        np.linalg.norm(doc_original_latent, axis=1).reshape(
            (doc_original_latent.shape[0], 1)) + 1e-60)

    print('Number of users: ' + str(n_users))
    print('Number of docs: ' + str(n_docs))
    print('Number of latent dimensions: ' + str(n_latent))

    # Initialising
    print('Initialising')
    if (user_init_symmetric):
        user_latent = np.random.rand(n_users, n_latent) * 2 - 1
    else:
        user_latent = np.random.rand(n_users, n_latent)
    #user_latent /= np.linalg.norm(user_latent)
    #user_latent *= np.sqrt(np.sum(user_doc_sparse_mat.data*user_doc_sparse_mat.data))
    user_latent -= np.mean(user_latent, axis=0)
    user_latent /= (np.linalg.norm(user_latent, axis=1).reshape(
        (user_latent.shape[0], 1)) + 1e-60)
    #user_latent = user_doc_sparse_mat.dot(doc_original_latent) / (1+np.array(user_doc_sparse_mat.sum(axis=1)).reshape(user_doc_sparse_mat.shape[0],1))
    #user_latent /= ((np.linalg.norm(user_latent, axis = 1)*np.linalg.norm(user_latent, axis = 1)).reshape((user_latent.shape[0], 1))+1e-60)
    #user_latent *= 50
    #user_latent /= np.linalg.norm(user_latent,axis=0)
    doc_latent = doc_original_latent.copy()
    #doc_latent = doc_latent / np.linalg.norm(doc_latent, axis=0)

    # kappa_, epsilon_, zeta_ and n_iter are read from the json settings (and gamma_, unless given as the optional 4th value); alpha_, lambda_ and theta_ are given as inputs.

    params_dict = json.load(open('minibatch_settings.json', mode='r'))
    kappa_ = params_dict['kappa_']
    epsilon_ = params_dict['epsilon_']
    if gamma_ == -1:
        gamma_ = params_dict['gamma_']
    n_iter = params_dict['n_iter']
    zeta_ = params_dict['zeta_']
    #theta_ = params_dict['theta_']

    # These are the old values we used to use.
    # kappa_ = 10
    # epsilon_ = 20
    # gamma_ = 5e-2
    # n_iter = 200000
    # alpha_ = 1
    # lambda_ = 1e-3
    # theta_ = 1e-4

    errors_list = []
    print('Initialisation complete.')

    i = 0
    while i < n_iter:
        user_old = user_latent.copy()
        doc_old = doc_latent.copy()
        step_size = gamma_ / (1 + int(i / GAMMA_DECREASE_STEP))

        rand_choices = random.sample(range(0, n_nonzeros),
                                     int(0.9 * BATCH_SIZE))
        user_rand_indices = [
            user_doc_nonzero_indices[0][rand_index]
            for rand_index in rand_choices
        ]
        doc_rand_indices = [
            user_doc_nonzero_indices[1][rand_index]
            for rand_index in rand_choices
        ]
        user_rand_indices.extend([
            random.randint(0, n_users - 1)
            for j in range(0, BATCH_SIZE - len(rand_choices))
        ])
        doc_rand_indices.extend([
            random.randint(0, n_docs - 1)
            for j in range(0, BATCH_SIZE - len(rand_choices))
        ])

        for rand_index in range(0, len(user_rand_indices)):
            user_index = user_rand_indices[rand_index]
            doc_index = doc_rand_indices[rand_index]
            e_ui = user_doc_sparse_mat[user_index, doc_index]
            r_ui = int(e_ui > 0)
            c_ui = 1 + kappa_ * np.log(1 + e_ui / epsilon_)
            coef1 = -2 * c_ui * (
                r_ui - np.dot(user_old[user_index, :], doc_old[doc_index, :]))
            user_latent[user_index, :] -= step_size * (coef1 *
                                                       doc_old[doc_index, :])
            doc_latent[doc_index, :] -= step_size * (coef1 *
                                                     user_old[user_index, :])

        uri_set = set(user_rand_indices)
        dri_set = set(doc_rand_indices)
        for user_index in uri_set:
            user_latent[user_index, :] -= 2 * step_size * alpha_ * user_old[
                user_index, :]
        for doc_index in dri_set:
            doc_latent[doc_index, :] -= step_size * (
                2 * lambda_ *
                (doc_old[doc_index, :] - doc_original_latent[doc_index, :]) +
                zeta_ * (1.0 * (doc_old[doc_index, :] > 0).astype(int) - 1.0 *
                         (doc_old[doc_index, :] < 0).astype(int)))

        qtq_minus_diag = doc_old.transpose().dot(doc_old)
        qtq_minus_diag -= np.diag(np.diag(qtq_minus_diag))
        doc_latent -= 4 * theta_ * step_size * doc_old.dot(qtq_minus_diag)

        i += BATCH_SIZE
        print(i, alpha_, lambda_, theta_, gamma_, zeta_)
        #do_gd_step(user_doc_sparse_mat, user_latent, doc_latent, doc_original_latent, user_index,doc_index, i,
        #           kappa_, epsilon_, alpha_, lambda_, gamma_, theta_)

        if (i % (20 * BATCH_SIZE) == 0):
            print('Errors:')
            current_error = calc_error(user_doc_sparse_mat, user_latent,
                                       doc_latent, doc_original_latent, kappa_,
                                       epsilon_, alpha_, lambda_)
            print(current_error)
            #errors_list.append(current_error)
            print('----------------')

    print('Saving')

    make_sure_path_exists(add_slash_to_dir(output_dir))
    f1 = open(add_slash_to_dir(output_dir) + 'user_latent.pickle', mode='wb')
    pickle.dump(user_latent, f1)
    f1.close()

    f2 = open(add_slash_to_dir(output_dir) + 'doc_latent.pickle', mode='wb')
    pickle.dump(doc_latent, f2)
    f2.close()

    f3 = open(add_slash_to_dir(output_dir) + 'params.json', mode='w')
    json.dump(
        {
            'alpha_': alpha_,
            'lambda_': lambda_,
            'gamma_': gamma_,
            'n_iter': n_iter,
            'kappa_': kappa_,
            'epsilon_': epsilon_,
            'theta_': theta_,
            'zeta_': zeta_,
            'BATCH_SIZE': BATCH_SIZE
        }, f3)
    f3.close()

    if (do_validation):
        k_topics = 50
        print('Calculating validation errors:')
        pred_error, n_distinct_val_users = prediction_error_pair_sqerr(
            user_doc_sparse_mat_original, user_latent, doc_latent,
            validation_pairs, kappa_, epsilon_)
        #The maximum cohesion score ever possible is 2*k_topics (possible if all top and bottom scores come out as 1)
        cohesion_score, _, _ = calc_cohesion_score(doc_latent,
                                                   doc_original_latent,
                                                   k=k_topics)
        error_dict = {
            'prediction_error': pred_error,
            'cohesion_score': cohesion_score,
            'n_validation_users': n_distinct_val_users,
            'k_topics': k_topics,
            'n_validation_nonzeros': len(validation_pairs[0])
        }
        json.dump(error_dict,
                  open(add_slash_to_dir(output_dir) + 'errors.json', mode='w'))
        print('Prediction error:')
        print(pred_error)
        print('Cohesion score:')
        print(cohesion_score)
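
The update rules above are plain minibatch SGD steps (each batch mixes roughly 90% observed nonzero (user, doc) pairs with 10% uniformly random pairs acting as implicit negatives). Reading the gradients back, the objective they appear to minimize is the following reconstruction (not a formula stated in the source), with P the user latent matrix, Q the doc latent matrix, Q^0 the input doc-term latent matrix, e_ui the edit count, r_ui = 1[e_ui > 0] and c_ui = 1 + kappa * log(1 + e_ui / epsilon):

\min_{P,Q} \sum_{u,i} c_{ui}\,(r_{ui} - p_u^\top q_i)^2
    + \alpha \sum_u \lVert p_u \rVert_2^2
    + \lambda \sum_i \lVert q_i - q_i^0 \rVert_2^2
    + \zeta \sum_i \lVert q_i \rVert_1
    + \theta \sum_{i \neq j} (q_i^\top q_j)^2

The last term penalises correlated doc dimensions; its gradient 4\,\theta\,Q(Q^\top Q - \operatorname{diag}(Q^\top Q)) is exactly the qtq_minus_diag update.
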
Example No. 13
def main():
    usage_str = 'Shows interpretations of singular vectors or principal components. The vectors are assumed to be in ' \
                'column form, i.e. a 2d array of shape (n_dims, n_components). The output consists of the questions ' \
                'both in txt and json formats, and the column-normalised latent representations ' \
                'used for recommendation plus the list of ' \
                'article ids that should be avoided since they appear in the questions.\n' \
                '1. Name of doc names file (mapping of doc name to doc id)\n' \
                '2. Dir of id to column index mapping dict\n' \
                '3. Name of pickle file containing the Q matrix. (the outputs will be saved in the same dir)\n' \
                '4. Optional, number of questions. If not provided, generates all the questions.'
    if (len(sys.argv) < 4 or len(sys.argv) > 5):
        print(usage_str)
        return
    name_filename = sys.argv[1]
    dict_dir = sys.argv[2]
    vectors_file_name = sys.argv[3]
    n_questions = -1
    if (len(sys.argv) == 5):
        try:
            n_questions = int(sys.argv[4])
            if (n_questions < 1):
                print(usage_str)
                return
        except ValueError:
            print(usage_str)
            return

    col_dict = json.load(
        open(add_slash_to_dir(dict_dir) + 'col_dict.json', mode='r'))
    col_dict = intify_dict(invert_dict(col_dict))
    names_dict = get_id_name_dict(name_filename)

    Q = np.array(pickle.load(open(vectors_file_name, mode='rb')))
    if (n_questions == -1 or n_questions > Q.shape[1]):
        n_questions = Q.shape[1]
    Q = Q[:, :n_questions]

    out_file_txt = open(vectors_file_name + '_interpreted_' +
                        str(n_questions) + '.txt',
                        mode='w')
    out_file_json = open(vectors_file_name + '_questions_dict_' +
                         str(n_questions) + '.json',
                         mode='w')

    latent_reps, id_avoid_list = create_questions(col_dict, Q, names_dict,
                                                  out_file_txt, out_file_json)

    out_file_txt.close()
    out_file_json.close()

    name_avoid_list = [names_dict[x] for x in id_avoid_list]

    pickle.dump(
        latent_reps,
        open(vectors_file_name + '_latent_rep_' + str(n_questions) + '.pkl',
             mode='wb'))
    pickle.dump(
        id_avoid_list,
        open(vectors_file_name + '_id_avoid_list_' + str(n_questions) + '.pkl',
             mode='wb'))
    pickle.dump(
        name_avoid_list,
        open(vectors_file_name + '_name_avoid_list_' + str(n_questions) +
             '.pkl',
             mode='wb'))
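
invert_dict and intify_dict are small project helpers; plausible one-liners matching how they are used above (JSON object keys are always strings, hence the int cast):

def invert_dict(d):
    # Swap keys and values; assumes the values are unique.
    return {v: k for k, v in d.items()}

def intify_dict(d):
    # Cast string keys (as loaded from JSON) back to ints.
    return {int(k): v for k, v in d.items()}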