from collections import defaultdict

# The helpers list_files_in_directory, read_csv_file and csv_iterator, and the
# text processor tp, are assumed to be imported from the surrounding project.


def compute_num_terms():
    term_count = defaultdict(int)
    files = list_files_in_directory("/Users/ra-mit/data/mitdwhdata")
    total_files = len(files)
    iteration = 0
    for f in files:
        print("Processing: " + str(f))
        df = read_csv_file(f)
        # Count tokens appearing in the column names
        columns = df.columns
        for c in columns:
            clean_tokens = tp.tokenize(c, " ")
            for ct in clean_tokens:
                term_count[ct] += 1
        print(str(iteration) + "/" + str(total_files))
        iteration += 1
        # if iteration > 5:
        #     continue
        print("Size: " + str(len(term_count)))
        # Count tokens appearing in the data itself
        it = csv_iterator(f)
        for t in it:
            clean_tokens = tp.tokenize(t, ",")
            for ct in clean_tokens:
                term_count[ct] += 1
    ordered = sorted(term_count.items(), key=lambda x: x[1], reverse=True)
    for el in ordered:
        print(str(el))


def process_file(path, term_map=None):
    # A mutable default argument (defaultdict(int)) would be shared across
    # calls, so create a fresh map when none is passed in.
    if term_map is None:
        term_map = defaultdict(int)
    print("Processing: " + str(path))
    df = ca.read_csv_file(path)
    # Count tokens in the column names
    columns = df.columns
    for c in columns:
        clean_tokens = tp.tokenize(c, " ")
        for ct in clean_tokens:
            term_map[ct] += 1
    # Count tokens in the rows
    it = ca.csv_iterator(path)
    for t in it:
        clean_tokens = tp.tokenize(t, ",")
        for ct in clean_tokens:
            term_map[ct] += 1
    return term_map


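# Example (sketch, not part of the original code): aggregate term counts over a
# whole directory by reusing process_file. The directory path is the one used
# above, and ca.list_files_in_directory is assumed to exist alongside
# ca.read_csv_file and ca.csv_iterator.
def example_directory_term_counts(directory="/Users/ra-mit/data/mitdwhdata"):
    term_map = defaultdict(int)
    for path in ca.list_files_in_directory(directory):
        process_file(path, term_map)  # accumulate all files into one shared map
    # Most frequent terms first
    return sorted(term_map.items(), key=lambda x: x[1], reverse=True)

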
def iterate_columns_with_header(path):
    # Yields, per column, the comma-joined tokens of its string values
    # together with the comma-joined tokens of its header.
    dataframe = pd.read_csv(path, encoding='latin1')
    columns = dataframe.columns
    for c in columns:
        clean_tokens = []
        data = dataframe[c]
        for el in data:
            if type(el) is str:
                ct = tp.tokenize(el, " ")
                for t in ct:
                    clean_tokens.append(t)
        col_values = ','.join(clean_tokens)
        header = ','.join(tp.tokenize(c, " "))
        yield col_values, header


def get_header(path):
    dataframe = pd.read_csv(path, encoding='latin1')
    columns = dataframe.columns
    clean_tokens = []
    for c in columns:
        ct = tp.tokenize(c, " ")
        for t in ct:
            clean_tokens.append(t)
    clean_tuple = ','.join(clean_tokens)
    return clean_tuple


def iterate_rows_with_header(path):
    dataframe = pd.read_csv(path, encoding='latin1')
    columns = dataframe.columns
    for index, el in dataframe.iterrows():
        row = []
        for c in columns:
            value = el[c]
            if type(value) is str:
                ct = tp.tokenize(value, " ")
                for t in ct:
                    row.append(t)
        yield ','.join(row)


def iterate_rows_no_header(path, token_joiner=","): dataframe = pd.read_csv(path, encoding='latin1') columns = dataframe.columns for index, el in dataframe.iterrows(): row = [] for c in columns: value = el[c] if type(value) is str: value = value.replace(",", ' ') ct = tp.tokenize(value, " ") tuple = token_joiner.join(ct) row.append(tuple) yield row
def csv_iterator_with_header(path):
    # Yields each row as a comma-joined string, with the tokenized column
    # names appended to every row.
    dataframe = pd.read_csv(path, encoding='latin1')
    columns = dataframe.columns
    clean_col_tokens = set()
    for c in columns:
        toks = tp.tokenize(c, " ")
        for t in toks:
            clean_col_tokens.add(t)
    for index, row in dataframe.iterrows():
        tuple_list = []
        for col in columns:
            tuple_list.append(str(row[col]))
        for c in clean_col_tokens:
            tuple_list.append(c)
        yield ','.join(tuple_list)


def iterate_columns_no_header(path, token_joiner=",", verbose=False):
    dataframe = pd.read_csv(path, encoding='latin1')
    columns = dataframe.columns
    for c in columns:
        if verbose:
            print("Col: " + str(c))
        data = dataframe[c]
        col = []
        for el in data:
            if type(el) is str:
                el = el.replace(",", ' ')
                ct = tp.tokenize(el, " ")
                col.append(token_joiner.join(ct))
        yield col


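# Example (sketch, not part of the original code): consuming the iterators
# above on a single CSV file. "some_file.csv" is a placeholder path.
def example_iterate_csv(path="some_file.csv"):
    print("Header tokens: " + get_header(path))
    for row in iterate_rows_with_header(path):
        print(row)  # one comma-joined string of tokens per row
    for col in iterate_columns_no_header(path, verbose=True):
        print(len(col))  # number of string cells kept for this column

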
def init(path_to_data=None, path_to_vocab=None, path_to_location=None,
         path_to_model=None, path_to_ae_model=None, path_to_vae_model=None,
         path_to_fqa_model=None, path_to_bae_model=None, bae_model_epoch=None,
         encoding_mode="onehot", where_is_fabric=False):
    # mit_dwh_vocab = U.get_tf_dictionary(path_to_vocab)
    with open(path_to_vocab, 'rb') as f:
        tf_vocab = pickle.load(f)
    global vocab
    vocab = tf_vocab
    global inv_vocab
    inv_vocab = dict()
    for k, v in vocab.items():
        inv_vocab[v] = k

    if path_to_location is not None:
        # Declare the globals before assigning them; assigning a local first
        # and then declaring it global is a syntax error in Python 3.
        global location_dic
        global inv_location_dic
        with open(path_to_location + config.LOC_DICTIONARY + ".pkl", 'rb') as f:
            location_dic = pickle.load(f)
        with open(path_to_location + config.INV_LOC_DICTIONARY + ".pkl", 'rb') as f:
            inv_location_dic = pickle.load(f)

    if path_to_model is not None:
        global model
        model = mc.load_model_from_path(path_to_model)

    global emode
    emode = encoding_mode

    if path_to_ae_model is not None:
        # ae_model = ae.load_model_from_path(path_to_ae_model)
        global encoder
        encoder = ae.load_model_from_path(path_to_ae_model + "/ae_encoder.h5")
        global decoder
        decoder = ae.load_model_from_path(path_to_ae_model + "/ae_decoder.h5")

    if path_to_fqa_model is not None:
        global fqa_model
        fqa_model = fqa.load_model_from_path(path_to_fqa_model + "fqa.h5")

    if path_to_vae_model is not None:
        global vae_encoder
        vae_encoder = vae.load_model_from_path(path_to_vae_model + "/vae_encoder.h5")
        global vae_generator
        vae_generator = vae.load_model_from_path(path_to_vae_model + "/vae_generator.h5")

    if path_to_bae_model is not None:
        if bae_model_epoch is not None:
            enc_name = "/bae_encoder" + str(bae_model_epoch) + ".h5"
            dec_name = "/bae_decoder" + str(bae_model_epoch) + ".h5"
        else:
            enc_name = "/bae_encoder.h5"
            dec_name = "/bae_decoder.h5"
        # ae_model = ae.load_model_from_path(path_to_ae_model)
        global bae_encoder
        bae_encoder = bae.load_model_from_path(path_to_bae_model + enc_name)
        global bae_decoder
        bae_decoder = bae.load_model_from_path(path_to_bae_model + dec_name)

    if where_is_fabric:
        # fabric_encoder = ae.load_model_from_path(path_to_ae_model + "/ae_encoder.h5")
        # bae_encoder = bae.load_model_from_path(path_to_ae_model + "/ae_encoder.h5")
        # compute max_v and min_v
        # max_v, min_v, mean_v, std_v = find_max_min_mean_std_per_dimension(path_to_data, bae_encoder)

        def embed_vector(v):
            x = v.toarray()[0]
            x_embedded = bae_encoder.predict(np.asarray([x]))
            # x_embedded = normalize_to_unitrange_per_dimension(x_embedded[0], max_vector=max_v, min_vector=min_v)
            # x_embedded = normalize_per_dimension(x_embedded[0], mean_vector=mean_v, std_vector=std_v)
            # v = v[0]
            # v = normalize_per_dimension(v, mean_vector=mean_v, std_vector=std_v)
            # Normalization for the binary fabric: dimensions above 0.66 are
            # set to 0, dimensions below 0.33 are set to 1, the rest stay at 0.5.
            zidx = np.where(x_embedded[0] > 0.66)
            oidx = np.where(x_embedded[0] < 0.33)
            x_embedded.fill(0.5)
            x_embedded[0][zidx[0]] = 0
            x_embedded[0][oidx[0]] = 1
            return x_embedded
            # return v

        global normalizeFVector
        normalizeFVector = NormalizeFVectors(normalize_function=embed_vector)
        global where_is_use_fabric
        where_is_use_fabric = where_is_fabric

    if encoding_mode == "onehot":
        tf_vectorizer = CountVectorizer(max_df=1., min_df=0,
                                        encoding='latin1',
                                        tokenizer=lambda text: tp.tokenize(text, " "),
                                        vocabulary=tf_vocab,
                                        stop_words='english')
        global vectorizer
        vectorizer = tp.CustomVectorizer(tf_vectorizer)
    elif encoding_mode == "index":
        idx_vectorizer = IndexVectorizer(vocab_index=vocab)
        global vectorizer
        vectorizer = tp.CustomVectorizer(idx_vectorizer)


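# Example (sketch, not part of the original code): a possible call to init().
# All paths below are placeholders for artifacts produced elsewhere in the
# project (vocabulary pickle, location dictionaries, trained binary
# autoencoder), so the call is left commented out.
# init(path_to_vocab="tf_dictionary.pkl",
#      path_to_location="locations/",
#      path_to_bae_model="bae_model",
#      encoding_mode="onehot",
#      where_is_fabric=True)

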
pronouns = ["She", "He", "she", "he"] all_files = csv_access.list_files_in_directory(path) #all_files = [all_files[0]] for fpath in all_files: name = (fpath.split("/")[-1]).split(".")[0] pre_processed_tokens = [] with open(fpath, "r") as f: relations = f.readlines() for r in relations: tokens = r.split(" ")[1::] # remove number pre_tokens = tp.tokenize(" ".join(tokens), " ") # clean stuff pre_tokens = [el.strip() for el in pre_tokens] # change pronouns by names for idx in range(len(pre_tokens)): tk = pre_tokens[idx] if tk in pronouns: pre_tokens[idx] = name # # add name if not present already # if name not in pre_tokens: # pre_tokens.append(name) if len(pre_tokens) > 0: pre_processed_tokens.append(set(pre_tokens)) # Remove near-duplicates idx_to_remove = set() for i in range(len(pre_processed_tokens)):
for fpath in all_files:
    name = (fpath.split("/")[-1]).split(".")[0]
    # append_header(fpath, out_path, name)
    # continue
    lines = []
    print("Processing: " + str(fpath))
    df = pd.read_csv(fpath, encoding='latin1')
    for index, row in df.iterrows():
        s = row['s']
        p = row['p']
        o = row['o']
        # clean stuff
        s_tokens = tp.tokenize(s, " ", min_token_length=1)
        s_tokens = [el.strip() for el in s_tokens]
        for idx in range(len(s_tokens)):
            if s_tokens[idx] in pronouns:
                s_tokens[idx] = name
        p_tokens = tp.tokenize(p, " ", min_token_length=1)
        p_tokens = [el.strip() for el in p_tokens]
        for idx in range(len(p_tokens)):
            if p_tokens[idx] in pronouns:
                p_tokens[idx] = name
        o_tokens = tp.tokenize(o, " ", min_token_length=1)
        o_tokens = [el.strip() for el in o_tokens]
        for idx in range(len(o_tokens)):
            if o_tokens[idx] in pronouns:
                o_tokens[idx] = name
        clean_s = " ".join(s_tokens)

files = list_files_in_directory("/Users/ra-mit/data/mitdwhdata")
for f in files:
    print(str(f))

example = files[0]
print(example)

it = csv_iterator_with_header(example)

from preprocessing import text_processor as tp

for t in it:
    clean_tokens = tp.tokenize(t, ",")
    print(str(clean_tokens))

# print("Computing number of terms...")
#
# compute_num_terms()
#
# print("Computing number of terms...OK")