def window_column(paths, output_file, debug=False):
    """Dump the columns of every valid relation into one CSV file.

    Any existing ``output_file`` is deleted first. Each path is then read
    with pandas (``latin1`` encoding); relations rejected by
    ``dpu.valid_relation`` are skipped. For each remaining relation, every
    column is written as a single CSV row holding the encoded values of its
    valid cells (columns with no valid cell produce no row), followed by a
    sentinel row ``~R!RR*~`` marking the end of the relation.

    Args:
        paths: Sized iterable of CSV file paths to read.
        output_file: Path of the CSV file to (re)create.
        debug: When true, print a ``current/total`` progress line per path.
    """
    try:
        os.remove(output_file)
    except FileNotFoundError:
        print("Creating new file for writing data")

    total = len(paths)
    for current, path in enumerate(paths):
        if debug:
            print(str(current) + "/" + str(total))
        df = pd.read_csv(path, encoding='latin1')
        # Check for valid relations only
        if not dpu.valid_relation(df):
            continue
        # Open in append mode per relation so the file is only created once
        # there is something to write; the context manager guarantees the
        # handle is flushed and closed. newline='' lets the csv module
        # control line endings itself.
        with open(output_file, 'a', newline='') as out:
            writer = csv.writer(
                out,
                delimiter=',',
                quotechar='"',
                quoting=csv.QUOTE_MINIMAL,
            )
            for c in df.columns:
                row = [
                    dpu.encode_cell(cell_value)
                    for cell_value in df[c]
                    if dpu.valid_cell(cell_value)
                ]
                if row:
                    writer.writerow(row)
            # TODO: why is it necessary to indicate end of relation?
            writer.writerow(["~R!RR*~"])
def _compose_cell_vector(text, we_model):
    """Average the embeddings of the space-separated tokens of ``text``.

    Args:
        text: Encoded cell value (a string).
        we_model: Object exposing ``get_vector(word)`` that raises
            ``KeyError`` for words it does not know.

    Returns:
        A ``(vector, missing)`` pair. ``vector`` is the mean of the vectors
        of all known tokens, or ``None`` when no token is known; ``missing``
        is the number of tokens the model did not know.
    """
    found = []
    missing = 0
    for token in text.split(" "):
        try:
            found.append(we_model.get_vector(token))
        except KeyError:
            missing += 1
    if not found:
        return None, missing
    # Average into a fresh array rather than accumulating in place, so the
    # arrays handed out by the model are never modified.
    return np.mean(found, axis=0), missing


def column_avg_unique_composition(df, we_model):
    """Compute one embedding per column from its unique valid cell values.

    Each unique value accepted by ``dpu.valid_cell`` is encoded with
    ``dpu.encode_cell`` and turned into a vector by averaging the embeddings
    of its space-separated tokens; the column vector is the mean of those
    cell vectors. Cells with no known token are skipped.

    Args:
        df: DataFrame whose columns are embedded.
        we_model: Object exposing ``get_vector(word)`` that raises
            ``KeyError`` for unknown words.

    Returns:
        ``(column_we, missing_words)`` where ``column_we`` maps each column
        name to its vector and ``missing_words`` counts tokens the model did
        not know. A column without any usable cell maps to the result of
        ``np.mean`` over an empty array (NaN).
    """
    column_we = dict()
    missing_words = 0
    for c in df.columns:
        col_wes = []
        for el in df[c].unique():
            # Check validity of cell
            if not dpu.valid_cell(el):
                continue
            vector, missing = _compose_cell_vector(dpu.encode_cell(el), we_model)
            missing_words += missing
            if vector is not None:
                col_wes.append(vector)
        column_we[c] = np.mean(np.asarray(col_wes), axis=0)
    return column_we, missing_words


def row_avg_composition(df, we_model):
    """Compute one embedding per row from its valid cell values.

    Each cell accepted by ``dpu.valid_cell`` is encoded with
    ``dpu.encode_cell`` and turned into a vector by averaging the embeddings
    of its space-separated tokens; the row vector is the mean of those cell
    vectors. Cells with no known token are skipped.

    Args:
        df: DataFrame whose rows are embedded.
        we_model: Object exposing ``get_vector(word)`` that raises
            ``KeyError`` for unknown words.

    Returns:
        ``(row_we_dict, missing_words)`` where ``row_we_dict`` maps each row
        index label to its vector and ``missing_words`` counts tokens the
        model did not know. A row without any usable cell maps to the result
        of ``np.mean`` over an empty array (NaN).
    """
    missing_words = 0
    row_we_dict = dict()
    columns = df.columns
    for i, row in df.iterrows():
        row_wes = []
        for c in columns:
            # Check validity of cell
            if not dpu.valid_cell(row[c]):
                continue
            vector, missing = _compose_cell_vector(dpu.encode_cell(row[c]), we_model)
            missing_words += missing
            if vector is not None:
                row_wes.append(vector)
        row_we_dict[i] = np.mean(np.asarray(row_wes), axis=0)
    return row_we_dict, missing_words
def _read_columns_from_dataframe(df, columns): for c in columns: data_values = df[c] for cell_value in data_values: # We check the cell value is valid before continuing if not dpu.valid_cell(cell_value): continue cell_value = dpu.encode_cell(cell_value) yield cell_value
def _read_rows_from_dataframe(df, columns): for index, el in df.iterrows(): for c in columns: cell_value = el[c] # We check the cell value is valid before continuing if not dpu.valid_cell(cell_value): continue # If valid, we clean and format it and return it cell_value = dpu.encode_cell(cell_value) yield cell_value