strings_y=["QAutf8"], feat_y=["QApos",]) #"QAaspect", "QAperson", "QAgender", "QAnumber", "QAcase", "QAvoice", "QAmood", "QAstate"]) source = list(itertools.chain(*sawarefData.quran_sent)) df = pd.DataFrame(source, columns=["sid", "aid", "wid", "mid"] + feat_x + ["embeddings","word","QAutf8"] + feat_y) df["embeddings"] = df["embeddings"].apply(truncate) df = flattencolumns(df, ["embeddings"]) df.set_index(["sid", "aid", "wid", "mid"], inplace=True) df.sort_index(inplace=True) ## 2. Pad the rows according to the longest word (in # of morphemes) SENTLEN = max(df.index.get_level_values("mid")) df = df.reindex(padIndexes( df, max(df.index.get_level_values("mid"))), fill_value=0).sort_index() ## 3. Get the hot encoding of all caterogirical data (see columns attr) dumm = pd.get_dummies(df, columns=feat_x + feat_y) ## 4. Add two-level columns for easy indexing later (wid, mid) EXAMPLES_LEN = df.shape[0]//SENTLEN new_columns = [] for x in dumm.columns: new_columns.append(re.sub('(_.*|[0-9]*)', '', x)) dumm.columns = [new_columns, dumm.columns] dumm.index = [[x for x in range(EXAMPLES_LEN) for _ in range(SENTLEN)], [x for _ in range(EXAMPLES_LEN) for x in range(SENTLEN)]] dumm = dumm.sort_index(axis=1)
# Helpers: flattencolumns() spreads a list-valued column into one column per element,
# truncate() keeps only the first EMBEDDINGS dimensions of an embedding vector.
def flattencolumns(df1, cols):
    df = pd.concat(
        [pd.DataFrame(df1[x].values.tolist()).add_prefix(x) for x in cols],
        axis=1)
    return pd.concat([df, df1.drop(cols, axis=1)], axis=1)


def truncate(x):
    return x[:EMBEDDINGS]


df["embeddings"] = df["embeddings"].apply(truncate)
df = flattencolumns(df, ["embeddings"])
df.set_index(["sid", "aid", "wid", "mid"], inplace=True)
df.sort_index(inplace=True)

SENTLEN = max(df.index.get_level_values("mid"))
df = df.reindex(padIndexes(df, SENTLEN), fill_value=0).sort_index()

dumm = pd.get_dummies(
    df,
    columns=sawarefData.features_map_x + sawarefData.features_map_y
)  # .reset_index().set_index("mid")  # .drop(["sid", "aid", "wid"], 1)
print("Done")

# dumm = dumm.reindex(padIndexes(dumm, max(df.index.get_level_values("mid"))), fill_value=0.0).sort_index()
# x_columns = [k + "_" + xx for k, x in sawarefData.features_set_x.items() for xx in x]
# y_columns = [k + "_" + xx for k, x in sawarefData.features_set_y.items() for xx in x]

# Input columns: every dummy column that belongs to one of the x feature groups
# (its name, with the feature name removed, starts with "_"), plus the flattened
# embedding dimensions.
x_columns = [
    y for f in sawarefData.features_map_x for y in dumm.columns
    if y.replace(f, "")[0] == "_"
] + ["embeddings" + str(i) for i in range(EMBEDDINGS)]
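# With the one-hot frame built, x_columns can be used to slice the network inputs
# and reshape them to (examples, SENTLEN, features), one feature row per morpheme
# slot. Illustrative sketch only; the float32 dtype and the reshape layout are
# assumptions, not something fixed by the cells above.
n_examples = dumm.shape[0] // SENTLEN
X = dumm[x_columns].to_numpy(dtype="float32")
X = X.reshape(n_examples, SENTLEN, -1)
print(X.shape)  # (examples, morpheme slots per word, input features)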