Esempio n. 1
0
 def get_data(data: pd.DataFrame) -> pd.DataFrame:
     """Expand each row of ``data`` into one row per (word, count) entry.

     For every row, the archive member named by the row's ``path`` value is
     read from the zip file at ``countpath``. Each line of that member is
     stripped, decoded as UTF-8 and split on tabs; the two resulting fields
     become the ``word`` and ``count`` columns (``count`` is cast to int).
     The ``path`` column is removed from the result.

     NOTE(review): ``countpath`` and ``filter`` are not parameters; they are
     resolved from the surrounding scope, which is not visible here. If
     ``filter`` resolved to the builtin, the ``is not None`` check would
     always pass and the ``map`` call below would fail -- confirm that an
     enclosing scope binds it to ``None`` or to a one-argument predicate.

     Args:
         data: Frame with a ``path`` column naming members of the archive.
             It is not modified.

     Returns:
         A new frame with the original non-``path`` columns plus ``word``
         and ``count``, one row per line read, optionally restricted to the
         rows whose ``word`` satisfies ``filter``.
     """

     def _read_entries(archive, member):
         # Close each member handle as soon as it has been read instead of
         # leaving one open handle per row until garbage collection.
         with archive.open(member, "r") as handle:
             return [
                 line.strip().decode("utf-8").split("\t") for line in handle
             ]

     with zipfile.ZipFile(countpath) as z:
         entries = [_read_entries(z, member) for member in data["path"]]

     # assign() builds a new frame, so the caller's DataFrame does not end up
     # with a stray "text" column.
     data = data.assign(text=entries).explode("text")
     data[["word", "count"]] = data["text"].tolist()
     data = data.drop(columns=["text", "path"])
     data["count"] = data["count"].astype(int)
     if filter is not None:
         data = data[data["word"].map(filter)]
     return data
def remove_dissemination_id_changes(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Drop the rows that a CORRECT or CANCEL row supersedes.

    Every row whose ``ACTION`` is ``'CORRECT'`` or ``'CANCEL'`` points at an
    earlier record through ``ORIGINAL_DISSEMINATION_ID``. When that ID, cast
    to ``int``, is a label of the frame's index, the row carrying that label
    is dropped. The CORRECT/CANCEL rows themselves are kept.

    A one-line summary of the number of cancels and corrections is printed.

    Args:
        dataframe: Frame with ``ACTION`` and ``ORIGINAL_DISSEMINATION_ID``
            columns. It is not modified.

    Returns:
        The input frame itself when nothing has to be dropped, otherwise a
        new frame without the superseded rows.

    Raises:
        ValueError: If a CORRECT/CANCEL row holds an
            ``ORIGINAL_DISSEMINATION_ID`` that ``int()`` cannot convert.
    """
    action = dataframe['ACTION']
    n_corrections = int((action == 'CORRECT').sum())
    n_cancels = int((action == 'CANCEL').sum())
    print(f'There have been {n_cancels} cancels and '
          f'{n_corrections} corrections in dissemination IDs')

    # Only the referenced IDs of CORRECT/CANCEL rows are needed, so read that
    # one column instead of building a Series per row with iterrows().
    referenced_ids = dataframe.loc[
        action.isin(['CORRECT', 'CANCEL']), 'ORIGINAL_DISSEMINATION_ID'
    ]
    index = dataframe.index
    to_drop = [o_id for o_id in map(int, referenced_ids) if o_id in index]

    if to_drop:
        dataframe = dataframe.drop(to_drop, axis=0)
    return dataframe
# The DataFrame annotation is quoted so that defining this function does not
# depend on how the module imports pandas.
def substitude_row(dataset: "pd.DataFrame", repo_name: str,
                   new_row: List[str]) -> None:
    """Replace, in place, the row(s) for ``repo_name`` with ``new_row``.

    All rows whose ``repo_name`` column equals ``repo_name`` are removed from
    ``dataset``, then ``new_row`` is added as a new last row. If no row
    matches, ``new_row`` is simply added.

    The change is made on the caller's object: rebinding the local name to
    the result of ``DataFrame.append`` (as was done before) never reached the
    caller, and ``append`` no longer exists in pandas 2.x.

    Args:
        dataset: Frame with a ``repo_name`` column; modified in place.
        repo_name: Value identifying the row(s) to replace.
        new_row: Values for the new row, one per column, in column order.

    Raises:
        ValueError: If ``new_row`` does not have one value per column.
    """
    stale_labels = dataset[dataset['repo_name'] == repo_name].index
    dataset.drop(labels=stale_labels, inplace=True)

    # After the drop the index may have gaps, so len(dataset) can collide
    # with a surviving label; advance to the first unused integer so that
    # .loc enlarges the frame instead of overwriting an existing row.
    new_label = len(dataset)
    while new_label in dataset.index:
        new_label += 1
    dataset.loc[new_label] = new_row