def get_example_data():
    """Load the example tensor named on the command line into flat lists.

    Reads ``sys.argv[1]`` as the folder containing ``headers.txt`` plus the
    five slice matrices, builds the tensor, and for every need index collects
    its subject words (one space-separated string) and all of its category
    indices (multilabel: a need may carry several categories).

    Returns:
        data: list of str -- subject words per need, each word followed by
            a single space (original formatting preserved).
        target: list of lists of category column indices per need.
        target_names: the tensor headers, used as category names.
    """
    # Folder passed as the first command line argument.
    data_file_prefix = sys.argv[1]
    header_file = data_file_prefix + '/headers.txt'
    data_files = [data_file_prefix + "/connection.mtx",
                  data_file_prefix + "/needtype.mtx",
                  data_file_prefix + "/subject.mtx",
                  data_file_prefix + "/content.mtx",
                  data_file_prefix + "/category.mtx"]
    slices = [SparseTensor.CONNECTION_SLICE, SparseTensor.NEED_TYPE_SLICE, SparseTensor.ATTR_SUBJECT_SLICE,
              SparseTensor.ATTR_CONTENT_SLICE, SparseTensor.CATEGORY_SLICE]

    tensor = read_input_tensor(header_file, data_files, slices, False)

    data = []
    target = []

    # Hoisted out of the loop: the category slice matrix is loop-invariant.
    category_matrix = tensor.getSliceMatrix(SparseTensor.CATEGORY_SLICE)

    # Store the chosen input into lists.
    for need_index in tensor.getNeedIndices():
        # All non-zero category columns for this need (multilabel target).
        category_index = category_matrix[need_index,].nonzero()[1].tolist()
        target.append(category_index)
        # join() instead of quadratic += concatenation; generator reproduces
        # the original "word + space" formatting exactly (incl. trailing space).
        content = "".join(
            word + " "
            for word in tensor.getAttributesForNeed(need_index, SparseTensor.ATTR_SUBJECT_SLICE))
        data.append(content)

    # Print out the input, just a check. Function-call form of print so the
    # module also parses under Python 3 (original used Python 2 statements;
    # output is identical for a single argument).
    target_names = tensor.getHeaders()
    print("test")
    print(data)
    print(target_names)
    print(target)

    return data, target, target_names
def get_example_data():
    """Load the example tensor into flat single-label lists.

    NOTE(review): this redefines ``get_example_data`` and therefore shadows
    the earlier (multilabel) variant in this file -- confirm which one is
    intended to win, or rename one of them.

    Reads ``sys.argv[1]`` as the data folder, keeps only needs that have at
    least one category (using just the first category as the label, i.e. no
    multilabel), and then keeps only samples whose label occurs more than
    50 times overall.

    Returns:
        data: list of str -- subject words per kept need.
        target: list of category indices (one label per sample).
        target_names: the tensor headers, used as category names.
    """
    # Local import keeps the file's top-level imports untouched.
    from collections import Counter

    # read the tensor from the folder passed by args
    data_file_prefix = sys.argv[1]
    header_file = data_file_prefix + '/headers.txt'
    data_files = [data_file_prefix + "/connection.mtx",
                  data_file_prefix + "/needtype.mtx",
                  data_file_prefix + "/subject.mtx",
                  data_file_prefix + "/content.mtx",
                  data_file_prefix + "/category.mtx"]
    slices = [SparseTensor.CONNECTION_SLICE, SparseTensor.NEED_TYPE_SLICE, SparseTensor.ATTR_SUBJECT_SLICE,
              SparseTensor.ATTR_CONTENT_SLICE, SparseTensor.CATEGORY_SLICE]

    tensor = read_input_tensor(header_file, data_files, slices, False)

    data = []
    target = []

    # Hoisted out of the loop: the category slice matrix is loop-invariant.
    category_matrix = tensor.getSliceMatrix(SparseTensor.CATEGORY_SLICE)

    # Store the chosen input into lists.
    # The "if" statement is meant to include only samples with a single
    # category label (no multilabel): the first category wins.
    for need_index in tensor.getNeedIndices():
        categories = tensor.getAttributesForNeed(need_index, SparseTensor.CATEGORY_SLICE)
        if len(categories) >= 1:
            category_index = category_matrix[need_index,].nonzero()[1][0]
            target.append(category_index)
            content = "".join(
                word + " "
                for word in tensor.getAttributesForNeed(need_index, SparseTensor.ATTR_SUBJECT_SLICE))
            data.append(content)

    # Include only few of all the categories (those with samples > 50).
    # Counter makes this O(n); the original called target.count() per
    # element, which is O(n^2) over the sample list.
    label_counts = Counter(target)
    newdata = []
    newtarget = []
    for label, sample in zip(target, data):
        if label_counts[label] > 50:
            newtarget.append(label)
            newdata.append(sample)

    data = newdata
    target = newtarget

    # Print out the input, just a check; print() form also parses under
    # Python 3 and gives identical output for a single argument.
    target_names = tensor.getHeaders()
    print("test")
    print(data)
    print(target_names)
    print(target)

    return data, target, target_names
# Beispiel #3 (scrape artifact: example separator; the "0" was a vote/score
# counter from the source page). The snippet below begins mid-function.
                        help="threshold of rescal algorithm to produce hints")
    # NOTE(review): this snippet starts mid-way through the argparse setup;
    # the parser construction and the other add_argument calls (inputfolder,
    # outputfolder, rank, ...) are not visible here.
    args = parser.parse_args()

    # load the tensor
    header_file = "headers.txt"

    # Collect every matrix-market (.mtx) slice file from the input folder.
    slice_files = []
    for file in os.listdir(args.inputfolder):
        if file.endswith(".mtx"):
            slice_files.append(file)

    # Build absolute-ish paths by prefixing the input folder.
    header_input = args.inputfolder + "/" + header_file
    data_input = []
    for slice in slice_files:
        data_input.append(args.inputfolder + "/" + slice)
    input_tensor = read_input_tensor(header_input, data_input, True)

    # execute rescal factorization at the requested rank
    A, R = execute_extrescal(input_tensor, args.rank)

    # predict new hints above the score threshold
    _log.info("predict hints with threshold: %f" % args.threshold)
    # mask matrix presumably excludes already-known connections from the
    # prediction -- confirm against create_hint_mask_matrix
    mask_matrix = create_hint_mask_matrix(input_tensor)
    connection_prediction = predict_rescal_hints_by_threshold(
        A, R, args.threshold, mask_matrix)
    _log.info("number of hints created: %d" %
              len(connection_prediction.nonzero()[0]))

    # write the hint output matrix
    output = args.outputfolder + "/" + "hints.mtx"
    _log.info("write hint prediction output matrix: " + output)
    # NOTE(review): snippet is cut off before the matrix is actually written.
# Beispiel #4 (scrape artifact: example separator; the "0" was a vote/score
# counter from the source page). The snippet below begins mid-function.
    # load the tensor
    # NOTE(review): fragment of a larger main routine; `args`, `_log` and the
    # surrounding function signature are defined outside this snippet.
    header_file = "headers.txt"
    atom_indices_file = "atomIndices.txt"

    # Collect every matrix-market (.mtx) slice file from the input folder.
    slice_files = []
    for file in os.listdir(args.inputfolder):
        if file.endswith(".mtx"):
            slice_files.append(file)

    header_input = args.inputfolder + "/" + header_file
    atom_indices_input = args.inputfolder + "/" + atom_indices_file
    data_input = []
    for slice in slice_files:
        data_input.append(args.inputfolder + "/" + slice)
    # Unlike the previous example, this variant also passes the atom index
    # file to the tensor reader.
    input_tensor = read_input_tensor(header_input, atom_indices_input,
                                     data_input, True)

    # execute rescal factorization at the requested rank
    A, R = execute_extrescal(input_tensor, args.rank)

    # predict new hints above the score threshold; here the tensor itself is
    # passed instead of a separate mask matrix
    _log.info("predict hints with threshold: %f" % args.threshold)
    connection_prediction = predict_rescal_hints_by_threshold(
        A, R, args.threshold, input_tensor)

    _log.info("number of hints created: %d" %
              len(connection_prediction.nonzero()[0]))

    # write the hint output matrix
    output = args.outputfolder + "/" + "hints.mtx"
    _log.info("write hint prediction output matrix: " + output)
    # NOTE(review): snippet is cut off before the matrix is actually written.
 def __init__(self, args, output_folder, logger, ground_truth, start_time):
     # NOTE(review): this method carries an odd one-space indent offset in
     # the scrape; kept byte-identical. Its enclosing class is not visible.
     # Delegates shared setup to self.init(...), then loads the prediction
     # matrix file as a one-slice connection tensor.
     self.init(args, output_folder, logger, ground_truth, start_time)
     header_input = args.inputfolder + "/" + args.headers
     self.file_prediction_tensor = read_input_tensor(
         header_input, [args.prediction_matrix_file], [SparseTensor.CONNECTION_SLICE], True)
    # NOTE(review): fragment of a larger setup routine; `folder`, `args`,
    # `_log` and `start_time` are defined outside this snippet.
    if args.outputfolder:
        outfolder = args.outputfolder
    else:
        # Default output location: timestamped subfolder under the data folder.
        outfolder = folder + "/out/" + start_time
    if not os.path.exists(outfolder):
        os.makedirs(outfolder)
    # Mirror the logger output into a per-run result file.
    hdlr = logging.FileHandler(outfolder + "/eval_result_" + start_time + ".log")
    _log.addHandler(hdlr)

    # load the tensor input data: mandatory connection/needtype slices plus
    # any additional slices named on the command line
    data_input = [folder + "/" + args.connection_slice, folder + "/" + args.needtype_slice]
    for slice in args.additional_slices:
        data_input.append(folder + "/" + slice)
    header_input = folder + "/" + args.headers
    slices = SparseTensor.defaultSlices + [SparseTensor.ATTR_CONTENT_SLICE, SparseTensor.CATEGORY_SLICE]
    input_tensor = read_input_tensor(header_input, data_input, slices, True)

    # TEST-PARAMETERS:
    # ===================

    # (10-)fold cross validation
    FOLDS = args.folds

    # True means: for testing mask all connections of random test needs (Test Case: Predict connections for new need
    # without connections)
    # False means: for testing mask random connections (Test Case: Predict connections for existing need which may
    # already have connections)
    MASK_ALL_CONNECTIONS_OF_TEST_NEED = not args.maskrandom

    # by changing this parameter the number of training connections per need can be set. Choose a high value (e.g.
    # 100) to use all connection in the connections file. Choose a low number to restrict the number of training
    # NOTE(review): snippet is cut off mid-comment here.