def _read_count_rows(archive, member):
    """Return the tab-separated fields of every line in one archive member."""
    # Use a context manager so the member handle is closed instead of leaked.
    with archive.open(member, "r") as handle:
        return [line.strip().decode("utf-8").split("\t") for line in handle]


def get_data(data: pd.DataFrame) -> pd.DataFrame:
    """Expand each row of *data* into one row per (word, count) line of its file.

    For every row, the archive member named by the row's ``path`` value is
    read from the zip file at ``countpath``. Each line is stripped, decoded
    as UTF-8 and split on tabs into a ``word`` and a ``count`` field.

    NOTE(review): ``countpath`` and ``filter`` are not parameters; they are
    resolved from the enclosing/module scope, which is not visible here. If
    no ``filter`` is defined there, the name falls back to the builtin
    ``filter``, which is never ``None`` and cannot be called with a single
    argument -- confirm where it is defined.

    Args:
        data: Frame with a ``path`` column naming members of the archive.
            It is not modified.

    Returns:
        A new frame holding the columns of *data* other than ``path``,
        repeated once per parsed line (index labels are repeated too), plus
        ``word`` and an integer ``count`` column. Members without any lines
        contribute no rows. When ``filter`` is not ``None``, only rows whose
        ``word`` maps to a truthy value are kept.

    Raises:
        KeyError: If a ``path`` value is not a member of the archive.
        Exception: pandas raises if a line does not split into exactly two
            fields or a count is not an integer literal.
    """
    with zipfile.ZipFile(countpath) as archive:
        rows_per_file = [
            _read_count_rows(archive, member) for member in data["path"]
        ]

    # Work on a copy: assigning "text" directly would leak a scratch column
    # into the caller's frame.
    staged = data.copy()
    staged["text"] = pd.Series(rows_per_file, index=staged.index, dtype=object)

    exploded = staged.explode("text")
    # An empty member explodes to a single NaN entry; it has no counts to
    # report, so drop it rather than fail while splitting the fields.
    exploded = exploded[exploded["text"].notna()]

    fields = pd.DataFrame(exploded["text"].tolist(), columns=["word", "count"])
    result = exploded.drop(columns=["text", "path"]).assign(
        # Assign positionally: the exploded index contains repeated labels.
        word=fields["word"].to_numpy(),
        count=fields["count"].astype(int).to_numpy(),
    )

    if filter is not None:
        result = result[result["word"].map(filter)]
    return result
def remove_dissemination_id_changes(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Drop the rows that a later CORRECT or CANCEL row refers to.

    Every row whose ``ACTION`` is ``'CORRECT'`` or ``'CANCEL'`` names, in
    ``ORIGINAL_DISSEMINATION_ID``, an index label of the row it supersedes.
    Those superseded rows are removed when present in the index; the
    CORRECT/CANCEL rows themselves are kept. A summary of how many cancels
    and corrections were seen is printed.

    NOTE(review): this assumes the frame is indexed by integer dissemination
    ID -- confirm against the caller.

    Args:
        dataframe: Frame with ``ACTION`` and ``ORIGINAL_DISSEMINATION_ID``
            columns. It is not modified.

    Returns:
        A frame without the superseded rows, or *dataframe* itself when
        there is nothing to drop.

    Raises:
        KeyError: If either required column is missing.
        ValueError: If an original ID cannot be converted to ``int``.
    """
    actions = dataframe['ACTION']
    n_corrections = int((actions == 'CORRECT').sum())
    n_cancels = int((actions == 'CANCEL').sum())
    print(f'There have been {n_cancels} cancels and '
          f'{n_corrections} corrections in dissemination IDs')

    is_change = actions.isin(['CORRECT', 'CANCEL'])
    # A change row without an original ID has nothing to supersede; skip it
    # instead of failing inside int().
    original_ids = dataframe.loc[is_change, 'ORIGINAL_DISSEMINATION_ID'].dropna()

    # dict.fromkeys de-duplicates while keeping first-seen order.
    candidates = dict.fromkeys(int(o_id) for o_id in original_ids)
    to_drop = [o_id for o_id in candidates if o_id in dataframe.index]

    if to_drop:
        dataframe = dataframe.drop(index=to_drop)
    return dataframe
def substitude_row(dataset: pd.DataFrame, repo_name: str, new_row: List[str]) -> None:
    """Replace, in place, the row(s) of *dataset* for *repo_name* with *new_row*.

    All rows whose ``repo_name`` column equals *repo_name* are dropped and
    *new_row* is inserted as a single row at the end of the frame. The new
    row reuses the index label of the first dropped row; when nothing
    matched, it gets an unused integer label.

    The previous implementation rebound the local name to the result of
    ``DataFrame.append`` (removed in pandas 2.0), so the caller never saw
    the new row.

    Args:
        dataset: Frame with a ``repo_name`` column; modified in place.
        repo_name: Value identifying the row(s) to replace.
        new_row: One value per column of *dataset*, in column order.

    Raises:
        KeyError: If *dataset* has no ``repo_name`` column.
        ValueError: If *new_row* does not have one value per column.
    """
    matched = dataset.index[dataset['repo_name'] == repo_name]
    dataset.drop(labels=matched, inplace=True)

    if len(matched) > 0:
        label = matched[0]
    else:
        # Choose a label that is not already used, so the enlargement below
        # cannot overwrite an unrelated row.
        label = len(dataset)
        while label in dataset.index:
            label += 1

    # Setting a missing label through .loc appends the row in place.
    dataset.loc[label] = new_row