Example #1
0
def parse_data(data_file_full_path):
    """ Parse the data file into the X matrix and the Y vector.

        Every line is split on ', ' and, when selected, handed to `parse`,
        which yields the feature vector and the label of that row. Three
        derived features are then appended to the feature vector, in this
        order: median, mean, variance. Each statistic is computed over the
        vector as it stands at that moment, so the mean also covers the
        appended median and the variance covers both.

        Any `Exception` raised while reading or parsing is printed and the
        rows collected so far are returned. Interpreter-level signals such
        as KeyboardInterrupt are no longer swallowed.

        Returns a tuple (final_x_matrix, final_y_vector).
    """
    final_x_matrix = []
    final_y_vector = []

    try:
        # The context manager closes the file even if parsing fails midway.
        with open(data_file_full_path) as data_file:
            for line in data_file:
                split_line = line.split(', ')

                # NOTE(review): only rows holding a bare "?" field are kept;
                # this looks inverted - confirm the intended handling of
                # missing values before changing it.
                if "?" not in split_line:
                    continue

                x_value, y_value = parse(split_line)

                # Adding median as a feature
                x_value.append(statistics.median(x_value))

                # Adding mean as a feature
                x_value.append(statistics.mean(x_value))

                # Adding variance as a feature
                x_value.append(statistics.variance(x_value))

                final_x_matrix.append(x_value)
                final_y_vector.append(y_value)

    except Exception as err:
        print("Error: ", err)

    # Returning after the handler (instead of inside `finally`) keeps the
    # best-effort result without suppressing BaseException subclasses.
    return final_x_matrix, final_y_vector
def parse_data(data_file_full_path):
    """ Parse the data file into the X matrix and the Y vector.

        Each line has its commas and newline removed and is split on single
        spaces. The resulting list goes through `data_valid_fixer`; rows for
        which it raises ValueError are skipped. Surviving rows are passed to
        `parse`, which yields the feature vector and the label.

        Returns a 4-tuple:
            (number of lines in the file,
             number of rows that were kept,
             final_x_matrix,
             final_y_vector)
    """
    # Read everything up front; the context manager closes the file promptly
    # instead of leaving the handle to the garbage collector.
    with open(data_file_full_path) as f:
        lines = f.readlines()

    data_size = len(lines)  # Original data size (amount of objects provided)
    final_x_matrix = []
    final_y_vector = []

    for line in lines:
        # List representation of the line, one entry per feature value
        line_list_presentation = line.replace(',', '').replace('\n', '').split(' ')
        try:
            # Validate the row and repair feature values where possible
            line_list_presentation = data_valid_fixer(line_list_presentation)
        except ValueError:
            # At least one feature holds a non-valid value: ignore the row
            continue

        x, y_value = parse(line_list_presentation)
        final_x_matrix.append(x)
        final_y_vector.append(y_value)

    return data_size, len(final_y_vector), final_x_matrix, final_y_vector
Example #3
0
def parse_test_data(test_file_full_path, means_and_frequents):
    """ Parse the test file into the X matrix and the Y vector.

        The file content is split into rows on newlines and every row is
        split on ', '. The first and the last row are discarded, the
        remaining rows are passed to `addMissingValues` together with
        `means_and_frequents` (presumably to fill in missing entries -
        confirm against that helper), and every returned row is converted
        by `parse` into a feature vector and a label.

        Returns a tuple (final_x_matrix, final_y_vector).
    """
    # The context manager guarantees the file handle is released.
    with open(test_file_full_path) as f:
        file_input = f.read().split('\n')

    rows = [row.split(', ') for row in file_input]

    # Drop the first and the last entry. Slicing (rather than remove + pop)
    # also copes with a file that holds a single line, where the old code
    # raised IndexError on the second removal.
    missing_matrix = rows[1:-1]

    # replace the missing values with the means and most frequents
    filled_matrix = addMissingValues(missing_matrix, means_and_frequents)

    final_x_matrix = []
    final_y_vector = []
    for row in filled_matrix:
        parsed = parse(row)
        final_x_matrix.append(parsed[0])
        final_y_vector.append(parsed[1])

    return final_x_matrix, final_y_vector
Example #4
0
def parse_data(data_file_full_path):
    """ Parse the data file into the X matrix and the Y vector.

        Every line is split on ", " and passed to `parse`; element 0 of its
        result is appended to the X matrix and element 1 to the Y vector.

        Returns a tuple (final_x_matrix, final_y_vector).
    """
    final_x_matrix = []
    final_y_vector = []

    # `with` closes the file even when `parse` raises on a malformed line;
    # the explicit close() at the end was skipped in that case.
    with open(data_file_full_path) as f:
        for line in f:
            ans = parse(line.split(", "))
            final_x_matrix.append(ans[0])
            final_y_vector.append(ans[1])

    return final_x_matrix, final_y_vector
Example #5
0
def parse_data(data_file_full_path):
    """ Parse the data file into the X matrix and the Y vector.

        Lines starting with '|' are skipped. Every other line is split on
        ', ' and converted by `parse` into a feature vector and a label.
        Rows whose raw line contains "?" are set aside; after the whole file
        has been read, every negative entry in those rows is replaced by the
        per-column average and the rows are appended after the complete ones.

        A file without any data rows yields two empty lists.

        Returns a tuple (final_x_matrix, final_y_vector).
    """
    final_x_matrix = []
    final_y_vector = []
    missing_vectors_x = []
    missing_vectors_y = []
    avg = [0] * 14  # running per-column sums, turned into averages below
    count = 0

    # `with` releases the file even if `parse` raises midway.
    with open(data_file_full_path, "r") as f:
        for line in f:
            if line.startswith('|'):
                continue

            # Membership test instead of `find(...) > 0`, which missed a "?"
            # sitting at the very first character of the line.
            is_missed = "?" in line

            x, y = parse(line.split(', '))

            # handling missed data: accumulate the non-negative values so the
            # column average can be put on missed features later
            count += 1
            for col, value in enumerate(x):
                if int(value) > -1:
                    avg[col] += int(value)

            if is_missed:
                missing_vectors_x.append(x)
                missing_vectors_y.append(y)
            else:
                final_x_matrix.append(x)
                final_y_vector.append(y)

    # calculate average of each feature; skipped when no row was read so an
    # empty (or comment-only) file no longer raises ZeroDivisionError.
    # NOTE(review): the divisor counts every row, including rows where the
    # column itself was missing - confirm whether that bias is intended.
    if count:
        avg = [total / count for total in avg]

    # replace the missing (negative) values with the column average
    for vector in missing_vectors_x:
        for j, value in enumerate(vector):
            if value < 0:
                vector[j] = avg[j]

    final_x_matrix += missing_vectors_x
    final_y_vector += missing_vectors_y

    return final_x_matrix, final_y_vector
Example #6
0
def parse_data(data_file_full_path):
    """ Build the X matrix and the Y vector from the data file.

        A line is split on ',' and every field is stripped of surrounding
        whitespace. Rows that hold a "?" field, or that do not have exactly
        15 fields, are dropped; every other row is converted by `parse` into
        a feature vector and a label.

        Returns a tuple (final_x_matrix, final_y_vector).
    """
    missing_marker = "?"
    expected_fields = 15
    final_x_matrix = []
    final_y_vector = []

    with open(data_file_full_path) as data_file:
        for raw_line in data_file:
            fields = list(map(str.strip, raw_line.split(',')))

            # Skip incomplete rows: unknown value or wrong number of fields
            if missing_marker in fields or len(fields) != expected_fields:
                continue

            features, label = parse(fields)
            final_x_matrix.append(features)
            final_y_vector.append(label)

    return final_x_matrix, final_y_vector
def parse_data(data_file_full_path):
    """ Parse the data file into the X matrix and the Y vector.

        Lines containing MISSING_DATA are skipped. Every other line is
        converted by `parse_line`; rows whose length differs from
        DATA_FEATURES are skipped as well. The remaining rows go through
        `parse`; when it returns None for either the features or the label,
        the row is dropped and the data is flagged as corrupted.

        Returns a 3-tuple (final_x_matrix, final_y_vector, corrupted_data).
    """
    corrupted_data = False
    final_x_matrix = []
    final_y_vector = []

    # `with` closes the handle deterministically; it used to stay open.
    with open(data_file_full_path) as f:
        for line in f:
            if MISSING_DATA in line:  # we'll skip lines with partial data
                continue

            data = parse_line(line)
            if len(data) != DATA_FEATURES:
                # the data line does not have all the features
                continue

            x, y = parse(data)
            if x is None or y is None:
                corrupted_data = True
                continue

            final_x_matrix.append(x)
            final_y_vector.append(y)

    return final_x_matrix, final_y_vector, corrupted_data
Example #8
0
def parse_data(data_file_full_path):
    """ Parse the data file into the X matrix and the Y vector.

        The file is read twice: once as raw text (rows split on newlines,
        entries split on ', ') and once through pandas with explicit column
        names. The DataFrame is handed to `getMeansAndMostFrequent`, and its
        result is passed with the raw rows to `addMissingValues`. The last
        row of that result is dropped and every remaining row is converted
        by `parse` into a feature vector and a label.

        Returns a 3-tuple
        (final_x_matrix, final_y_vector, means_and_most_frequents); the last
        element is meant to be fed to the test-data parser.
    """
    columns = [
        "age", "workclass", "fnlwgt", "education", "education-num",
        "martial-status", "occupation", "relationship", "race", "sex",
        "capital-gain", "capital-loss", "hours-per-week", "native-country",
        "salary"
    ]

    # splitting the data for each row; `with` makes sure the handle is
    # closed before pandas opens the same path again
    with open(data_file_full_path) as f:
        file_input = f.read().split('\n')

    # splitting every row into its entries using (", ")
    missing_matrix = [row.split(', ') for row in file_input]

    # read the data as csv with the column names, to be fed into get means and most frequents
    data = pd.read_csv(data_file_full_path, names=columns)
    means_and_most_frequents = getMeansAndMostFrequent(data)

    # replacing the missing values with the data in means and most frequents vector
    filled_matrix = addMissingValues(missing_matrix, means_and_most_frequents)

    # popping the last row (the empty one produced by the trailing newline)
    filled_matrix.pop()

    # parse each row to be replaced with numbers from the parse function
    final_x_matrix = []
    final_y_vector = []
    for row in filled_matrix:
        parsed = parse(row)
        final_x_matrix.append(parsed[0])
        final_y_vector.append(parsed[1])

    # return X matrix, y vector, and means and most frequents vector - to be used in parse test data function
    return final_x_matrix, final_y_vector, means_and_most_frequents