def __init__(self, rawdata_dir, max_docs_per_item=50, gold_sterr=0.5, n_items=0):
    """Load relevance judgments and derive a noisy gold ranking per topic.

    Args:
        rawdata_dir: Path or buffer handed to ``pd.read_csv``. It is read as
            a single space-separated file with the columns ``topic_item``,
            ``na``, ``doc`` and ``gold``.
        max_docs_per_item: Forwarded unchanged to ``self.trunc_docs``.
        gold_sterr: Standard deviation of the zero-mean Gaussian noise added
            to every gold value.
        n_items: Minimum number of topics wanted. When the file holds fewer,
            extra topics are synthesized by copying a randomly chosen topic
            and shuffling its gold values.

    Sets ``self.df``, ``self.topic_lookup``, ``self.doc_lookup`` and
    ``self.gold`` (per-topic arrays of docs ordered by descending gold).
    """
    columns = ["topic_item", "na", "doc", "gold"]
    try:
        # ``on_bad_lines`` replaced ``error_bad_lines`` in pandas 1.3; "warn"
        # matches the old error_bad_lines=False / warn_bad_lines=True default.
        frame = pd.read_csv(rawdata_dir, sep=" ", on_bad_lines="warn", names=columns)
    except TypeError:
        # Older pandas does not know ``on_bad_lines``.
        frame = pd.read_csv(rawdata_dir, sep=" ", error_bad_lines=False, names=columns)

    # Only rows whose ``na`` column equals 0 are kept.
    self.df = frame[frame["na"] == 0]
    self.df = self.trunc_docs(max_docs_per_item)

    topics = self.df["topic_item"].unique()
    n_items = max(n_items, len(topics))

    used_topics = set(topics)
    extradfs = []
    for i in range(n_items - len(topics)):
        topic = np.random.choice(topics)

        # The id is derived by scaling the source topic. Different
        # (topic, i) pairs can land on the same product (11*10 == 10*11),
        # which would silently merge two items in the groupby below, so
        # keep bumping the multiplier until the id is unused. The second
        # condition stops the probe for degenerate topics (0 or "") whose
        # product never changes.
        multiplier = 10 + i
        new_topic = topic * multiplier
        while new_topic in used_topics and topic * (multiplier + 1) != new_topic:
            multiplier += 1
            new_topic = topic * multiplier
        used_topics.add(new_topic)

        newdf = self.df[self.df["topic_item"] == topic].copy(deep=True)
        newdf["topic_item"] = new_topic
        # Shuffle gold so the synthetic topic has its own ranking.
        newdf["gold"] = np.random.permutation(newdf["gold"].values)
        extradfs.append(newdf)

    if extradfs:
        self.df = pd.concat([self.df] + extradfs)

    noise = np.random.normal(0, gold_sterr, len(self.df["gold"]))
    self.df["gold"] = self.df["gold"] + noise

    # NOTE(review): make_categorical presumably re-encodes the column in
    # place and returns the lookup table -- confirm against utils.
    self.topic_lookup = utils.make_categorical(self.df, "topic_item")
    self.doc_lookup = utils.make_categorical(self.df, "doc")

    def rank_docs(data):
        """Return the docs of one topic ordered by descending gold value."""
        return data.sort_values("gold", ascending=False).doc.values

    self.gold = self.df.groupby("topic_item").apply(rank_docs)
def setup(self, annodf, golddf, c_anno_uid=None, c_anno_item=None, c_anno_label=None, c_gold_item=None, c_gold_label=None):
    """Normalize annotation and gold frames, then build the Stan input.

    Args:
        annodf: Annotation frame holding a user, an item and a label column.
        golddf: Gold frame holding an item and a label column.
        c_anno_uid, c_anno_item, c_anno_label: Column names in ``annodf``
            when they differ from ``self.uid_colname`` /
            ``self.item_colname`` / ``self.label_colname``.
        c_gold_item, c_gold_label: Same, for ``golddf``.

    Sets ``self.annodf`` (canonical column names, rows with missing values
    dropped) and ``self.golddict`` (item -> gold label), then calls
    ``self.produce_stan_data()``.
    """
    uid_col = self.uid_colname
    item_col = self.item_colname
    label_col = self.label_colname

    # Caller-supplied column name -> canonical column name. Later entries
    # win if the same source name is given twice.
    overrides = (
        (c_anno_uid, uid_col),
        (c_anno_item, item_col),
        (c_anno_label, label_col),
        (c_gold_item, item_col),
        (c_gold_label, label_col),
    )
    colrename = {given: canonical for given, canonical in overrides if given is not None}

    selected = annodf[[c_anno_uid or uid_col, c_anno_item or item_col, c_anno_label or label_col]]
    self.annodf = selected.dropna().copy().rename(columns=colrename)[[uid_col, item_col, label_col]]

    # The frame now carries the canonical names, so those must be used here
    # rather than the literals "uid" / "item", which only work when the
    # configured names happen to match.
    # NOTE(review): make_categorical presumably encodes the column in place
    # and returns the mapping -- confirm against utils.
    utils.make_categorical(self.annodf, uid_col)
    itemdict = utils.make_categorical(self.annodf, item_col)

    golddf = golddf[[c_gold_item or item_col, c_gold_label or label_col]]
    golddf = golddf.rename(columns=colrename)[[item_col, label_col]]
    golddf = utils.translate_categorical(golddf, item_col, itemdict)
    self.golddict = golddf.set_index(item_col).to_dict()[label_col]

    self.produce_stan_data()
def setup(self):
    """Flatten the raw per-document annotations and hand them to the parent.

    Every (document, participant) pair in ``self.rawdf`` becomes one row
    carrying the participant's label, the document's "MedicalStudent" gold
    label and the two aggregate labels ("HMMCrowd", "MajorityVote") from
    ``self.aggdf``. Documents whose gold entry has no "MedicalStudent" key
    are skipped. The aggregate labels are then registered as the baselines
    "Tokenwise MV" and "Crowd-HMM".
    """
    gold_frame = self.golddf
    agg_frame = self.aggdf
    columns = {
        "uid": [],
        "itemID": [],
        "label": [],
        "gold": [],
        "HMMCrowd": [],
        "MajorityVote": [],
    }

    for _, record in self.rawdf.iterrows():
        doc_id = record["docid"]
        per_user = record["Participants"]

        expert = gold_frame[gold_frame["docid"] == doc_id]["Participants"].values[0]
        expert = expert.get("MedicalStudent")
        if expert is None:
            continue

        aggregated = agg_frame[agg_frame["docid"] == doc_id]["Participants"].values[0]
        for worker, worker_label in per_user.items():
            columns["uid"].append(worker)
            columns["itemID"].append(doc_id)
            columns["label"].append(label2tvr(worker_label, default=[]))
            # Converted once per row so every row owns its own gold object.
            columns["gold"].append(label2tvr(expert))
            columns["HMMCrowd"].append(aggregated["HMMCrowd"])
            columns["MajorityVote"].append(aggregated["MajorityVote"])

    df = pd.DataFrame(columns).sort_values("itemID")

    # Return values are unused here.
    # NOTE(review): presumably called for an in-place re-encoding of df --
    # confirm against utils.make_categorical.
    utils.make_categorical(df, "uid")
    utils.make_categorical(df, "itemID")

    anno_df = df.copy()
    # The same frame serves as annotation and gold source; gold labels live
    # in its "gold" column.
    super().setup(anno_df, anno_df, c_gold_label="gold")

    def baseline_labels(column):
        """Map each itemID to the converted first non-null value of `column`."""
        firsts = df.groupby("itemID").first()[column].dropna()
        return {key: label2tvr(value) for key, value in dict(firsts).items()}

    mv_labels = baseline_labels("MajorityVote")
    hmm_labels = baseline_labels("HMMCrowd")
    self.register_baseline("Tokenwise MV", mv_labels)
    self.register_baseline("Crowd-HMM", hmm_labels)
def setup(self, annodf, golddf=None, c_anno_uid=None, c_anno_item=None, c_anno_label=None, c_gold_item=None, c_gold_label=None, merge_index=None):
    """Normalize the annotation (and optional gold) frame and build Stan data.

    Args:
        annodf: Annotation frame with a user, an item and a label column.
        golddf: Optional gold frame with an item and a label column.
        c_anno_uid, c_anno_item, c_anno_label: Column names in ``annodf``
            when they differ from ``self.uid_colname`` /
            ``self.item_colname`` / ``self.label_colname``.
        c_gold_item, c_gold_label: Same, for ``golddf``.
        merge_index: Optional extra ``annodf`` column to carry along; its
            name is stored in ``self.merge_index_colname``.

    Sets ``self.annodf``, ``self.uiddict``, ``self.itemdict`` and, when
    ``golddf`` is given, ``self.golddict`` (entries whose value is ``None``
    are removed). Finishes by calling ``self.produce_stan_data()``.
    """
    uid_col = self.uid_colname
    item_col = self.item_colname
    label_col = self.label_colname

    # Caller-supplied name -> canonical name, in parameter order so that a
    # later entry overrides an earlier one with the same source name.
    pairs = [
        (c_anno_uid, uid_col),
        (c_anno_item, item_col),
        (c_anno_label, label_col),
        (c_gold_item, item_col),
        (c_gold_label, label_col),
    ]
    colrename = {}
    for source_name, canonical_name in pairs:
        if source_name is not None:
            colrename[source_name] = canonical_name

    source_cols = [
        c_anno_uid or uid_col,
        c_anno_item or item_col,
        c_anno_label or label_col,
    ]
    self.annodf = annodf[source_cols]
    self.annodf = self.annodf.rename(columns=colrename)[[uid_col, item_col, label_col]]

    if merge_index is not None:
        self.merge_index_colname = merge_index
        # Pulled from the caller's frame because the selection above
        # dropped it.
        self.annodf[merge_index] = annodf[merge_index]

    self.annodf = self.annodf.dropna().copy()
    self.uiddict = utils.make_categorical(self.annodf, uid_col)
    self.itemdict = utils.make_categorical(self.annodf, item_col)

    if golddf is not None:
        gold = golddf[[c_gold_item or item_col, c_gold_label or label_col]]
        gold = gold.rename(columns=colrename)[[item_col, label_col]]
        golddf = utils.translate_categorical(gold, item_col, self.itemdict)
        by_item = golddf.set_index(item_col).to_dict()[label_col]
        self.golddict = {
            item: label for item, label in by_item.items() if label is not None
        }

    self.produce_stan_data()
def __init__(self, rawdata_dir='data/coco/person_keypoints_train2017.json', max_items=500, minlabelsperitem=4):
    """Load keypoint annotations from a COCO-style JSON file.

    Args:
        rawdata_dir: Path of the JSON file; it must provide the top-level
            keys ``categories`` and ``annotations``.
        max_items: Maximum number of images taken from the file.
        minlabelsperitem: Images with fewer annotations than this are
            ignored.

    Sets ``self.category_id_skeletons`` (category id -> skeleton array
    shifted by -1), ``self.df`` (one row per image with the list of its
    keypoint arrays in ``gold``) and ``self.itemdict``.
    """
    with open(rawdata_dir, encoding="utf-8") as f:
        dataset = json.load(f)

    # NOTE(review): the -1 presumably turns 1-based joint indices into
    # 0-based ones -- confirm against the dataset format.
    self.category_id_skeletons = {
        c["id"]: np.array(c["skeleton"]) - 1 for c in dataset["categories"]
    }

    # Group annotations by the image they belong to.
    img_label = {}
    for dataset_annotation in dataset["annotations"]:
        img_label.setdefault(dataset_annotation["image_id"], []).append(dataset_annotation)

    eligible = [anns for anns in img_label.values() if len(anns) >= minlabelsperitem]

    item = []
    annotation = []
    category = []
    # Take exactly max_items images; the previous post-increment
    # `i > max_items` check let one extra image through.
    for dataset_annotations in eligible[:max(max_items, 0)]:
        for dataset_annotation in dataset_annotations:
            # Keypoints are stored flat as triples; keep the first two
            # values of every triple whose third value exceeds -90.
            kp = np.reshape(dataset_annotation["keypoints"], (-1, 3))
            kp = kp[kp[:, 2] > -90][:, :2]
            if len(kp) == 0:
                continue
            item.append(dataset_annotation["image_id"])
            annotation.append(kp)
            category.append(dataset_annotation["category_id"])

    kp_df = pd.DataFrame({
        "item": item,
        "gold": annotation,
        "category": category,
    })
    self.df = kp_df.groupby("item")["gold"].apply(list).reset_index()
    self.itemdict = utils.make_categorical(self.df, "item")
def __init__(self, rawdata_dir, max_items=10000):
    """Load bounding boxes and expose them as gold corner coordinates.

    Args:
        rawdata_dir: Path or buffer handed to ``pd.read_csv``. It is read as
            a header-less, space-separated file with the columns ``img``,
            ``x``, ``y``, ``w`` and ``h``.
        max_items: Only the first ``max_items`` rows are kept.

    Sets ``self.df``, ``self.img_lookup`` and ``self.gold`` (a Series
    indexed by ``img`` holding ``[x, y, x + w, y + h]`` lists).
    """
    columns = ["img", "x", "y", "w", "h"]
    try:
        # ``on_bad_lines`` replaced ``error_bad_lines`` in pandas 1.3; "warn"
        # matches the old error_bad_lines=False / warn_bad_lines=True default.
        frame = pd.read_csv(rawdata_dir, on_bad_lines="warn", header=None, sep=" ", names=columns)
    except TypeError:
        # Older pandas does not know ``on_bad_lines``.
        frame = pd.read_csv(rawdata_dir, error_bad_lines=False, header=None, sep=" ", names=columns)

    # Copy the slice so the column added below is written to an independent
    # frame rather than to a slice of ``frame``.
    self.df = frame[:max_items].copy()

    # Convert (x, y, width, height) into the two opposite corners.
    self.df["goldcoords"] = self.df.apply(
        lambda row: [row["x"], row["y"], row["x"] + row["w"], row["y"] + row["h"]],
        axis=1,
    )

    # NOTE(review): make_categorical presumably re-encodes "img" in place
    # and returns the lookup -- confirm against utils.
    self.img_lookup = utils.make_categorical(self.df, "img")
    self.gold = self.df.set_index("img")["goldcoords"]