Beispiel #1
0
def window_column(paths, output_file, debug=False):
    """Write the columns of every valid relation in *paths* to one CSV file.

    Each CSV in ``paths`` is loaded with pandas. Relations rejected by
    ``dpu.valid_relation`` are skipped. For the remaining ones, every column
    becomes one output row holding the ``dpu.encode_cell`` result of each
    cell accepted by ``dpu.valid_cell``; columns with no accepted cell are
    not written. A marker row ``~R!RR*~`` is written after each relation.

    Args:
        paths: Sized collection of CSV file paths (``len`` is taken of it).
        output_file: Destination path. Any existing file is deleted first,
            then rows are appended relation by relation.
        debug: When true, print a ``current/total`` progress line per path.
    """
    # Output is appended below, so a stale file from a previous run must be
    # removed up front.
    try:
        os.remove(output_file)
    except FileNotFoundError:
        print("Creating new file for writing data")

    total = len(paths)
    for current, path in enumerate(paths):
        if debug:
            print(str(current) + "/" + str(total))
        df = pd.read_csv(path, encoding='latin1')
        # Check for valid relations only
        if not dpu.valid_relation(df):
            continue
        # The file is opened per relation (not once up front) so that no
        # output file is created when no relation is valid. The context
        # manager closes and flushes the handle; newline='' lets the csv
        # module control line endings, as its documentation requires.
        with open(output_file, 'a', newline='') as out:
            writer = csv.writer(out,
                                delimiter=',',
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
            # Columns
            for c in df.columns:
                row = [
                    dpu.encode_cell(cell_value) for cell_value in df[c]
                    if dpu.valid_cell(cell_value)
                ]
                if row:
                    writer.writerow(row)
            # TODO: why is it necessary to indicate end of relation?
            writer.writerow(["~R!RR*~"])
Beispiel #2
0
def column_avg_unique_composition(df, we_model):
    """Compute one averaged word embedding per column from its unique values.

    Every unique cell of a column that passes ``dpu.valid_cell`` is encoded
    with ``dpu.encode_cell`` and split on single spaces into tokens. The cell
    vector is the mean of the vectors of all tokens that
    ``we_model.get_vector`` resolves; the column vector is the mean of its
    cell vectors.

    Args:
        df: DataFrame whose columns are embedded.
        we_model: Object exposing ``get_vector(token)``. A ``KeyError``
            raised by that call is counted as a missing word.

    Returns:
        A ``(column_we, missing_words)`` tuple: a dict mapping each column
        label to its mean vector, and the number of token lookups that
        raised ``KeyError``. A column without any resolvable cell gets
        whatever ``np.mean`` returns for an empty array (nan).
    """
    column_we = dict()
    missing_words = 0
    for c in df.columns:
        col_wes = []
        for el in df[c].unique():
            # Check validity of cell
            if not dpu.valid_cell(el):
                continue
            # A cell without spaces splits into a single token, so single-
            # and multi-word cells share one code path. Every token lookup,
            # including the first, is guarded.
            token_vectors = []
            for token in dpu.encode_cell(el).split(" "):
                try:
                    token_vectors.append(we_model.get_vector(token))
                except KeyError:
                    missing_words += 1
            if not token_vectors:
                # No token of this cell is known to the model.
                continue
            # np.mean builds a new array, so the arrays handed out by the
            # model are never modified in place.
            col_wes.append(np.mean(token_vectors, axis=0))
        column_we[c] = np.mean(np.asarray(col_wes), axis=0)
    return column_we, missing_words
Beispiel #3
0
def row_avg_composition(df, we_model):
    """Compute one averaged word embedding per row of *df*.

    Every cell of a row that passes ``dpu.valid_cell`` is encoded with
    ``dpu.encode_cell`` and split on single spaces into tokens. The cell
    vector is the mean of the vectors of all tokens that
    ``we_model.get_vector`` resolves; the row vector is the mean of its cell
    vectors.

    Args:
        df: DataFrame whose rows are embedded.
        we_model: Object exposing ``get_vector(token)``. A ``KeyError``
            raised by that call is counted as a missing word.

    Returns:
        A ``(row_we_dict, missing_words)`` tuple: a dict mapping each row
        index label (as produced by ``df.iterrows()``) to its mean vector,
        and the number of token lookups that raised ``KeyError``. A row
        without any resolvable cell gets whatever ``np.mean`` returns for an
        empty array (nan).
    """
    missing_words = 0
    row_we_dict = dict()
    columns = df.columns
    for i, row in df.iterrows():
        row_wes = []
        for c in columns:
            cell = row[c]
            # Check validity of cell
            if not dpu.valid_cell(cell):
                continue
            # A cell without spaces splits into a single token, so single-
            # and multi-word cells share one code path. Every token lookup,
            # including the first, is guarded.
            token_vectors = []
            for token in dpu.encode_cell(cell).split(" "):
                try:
                    token_vectors.append(we_model.get_vector(token))
                except KeyError:
                    missing_words += 1
            if not token_vectors:
                # No token of this cell is known to the model.
                continue
            # np.mean builds a new array, so the arrays handed out by the
            # model are never modified in place.
            row_wes.append(np.mean(token_vectors, axis=0))
        row_we_dict[i] = np.mean(np.asarray(row_wes), axis=0)
    return row_we_dict, missing_words
Beispiel #4
0
def _read_columns_from_dataframe(df, columns):
    """Lazily yield the encoded valid cells of *df*, one column at a time.

    Columns are visited in the order given by ``columns``. Within a column,
    cells rejected by ``dpu.valid_cell`` are dropped and the rest are passed
    through ``dpu.encode_cell`` before being yielded.
    """
    for name in columns:
        yield from (
            dpu.encode_cell(raw) for raw in df[name] if dpu.valid_cell(raw)
        )
Beispiel #5
0
def _read_rows_from_dataframe(df, columns):
    """Lazily yield the encoded valid cells of *df*, one row at a time.

    Rows come from ``df.iterrows()``. Within a row, the cells are read in
    the order given by ``columns``; those rejected by ``dpu.valid_cell`` are
    dropped and the rest are passed through ``dpu.encode_cell`` before being
    yielded.
    """
    for _, record in df.iterrows():
        for name in columns:
            raw = record[name]
            # Only valid cells are cleaned, formatted and emitted
            if dpu.valid_cell(raw):
                yield dpu.encode_cell(raw)