def MergeImpl(input: List[Path], output: Path):
    """
    Merge the SCSV rows of all *input* files into one MCSV file at *output*.

    Rows are matched to merge targets by their "functionality" field; every
    row must match exactly one target, otherwise io.FlistException is raised.
    """
    implicitly("prog.logger").debug(f"Running MergeImpl({input=}, {output=})")
    scsv_dataframe = flist.SCSV_Dataset.Dataframe_From_Files(input)
    scsv_all = flist.SCSV_Dataset.From_Dataframe(scsv_dataframe)

    merged_rows = [
        flist.Merged_Functionality(functionality)
        for functionality in scsv_all.get_functionalities()
    ]

    # Index the merge targets once instead of rescanning merged_rows for every
    # SCSV row (was O(rows * functionalities)). Lists keep duplicate
    # functionalities visible so the "multiple targets" check still fires.
    targets_by_functionality = {}
    for merged in merged_rows:
        targets_by_functionality.setdefault(merged.functionality, []).append(merged)

    for scsv_row in scsv_all.get_rows():
        merge_targets = targets_by_functionality.get(scsv_row["functionality"], [])
        if len(merge_targets) == 0:
            raise io.FlistException(f"no merge targets found for {scsv_row}")
        if len(merge_targets) > 1:
            raise io.FlistException(
                f"multiple merge targets found for {scsv_row}: {merge_targets}"
            )
        # Exactly one target after the checks above; the original looped over
        # the (length-1) list here, which was dead generality.
        merge_targets[0].merge_with(scsv_row)

    mcsv_rows = [merged.to_MCSV() for merged in merged_rows]
    mcsv_all = flist.MCSV_Dataset.From_Rows(mcsv_rows)
    mcsv_all.write_csv(output)
 def Dataframe_From_Files(files, columns):
     """
     Concatenate the given CSV files into a single dataframe whose columns
     are named after *columns*.

     Raises io.FlistException for an empty file list or a nonexistent file.
     """
     if len(files) == 0:
         raise io.FlistException("empty csv file list")
     # Fail fast on the first missing file before reading anything.
     for candidate in files:
         if not Path(candidate).exists():
             raise io.FlistException(f"file {candidate} does not exist")
     return pandas.concat(
         CSV_Dataset.Dataframe_From_File(candidate, columns)
         for candidate in files
     )
def substituteFileToFile(infilePath, outfilePath, **kwargs):
    """
    Copy *infilePath* to *outfilePath*, replacing every ``${key}`` placeholder
    with the corresponding value from *kwargs*.

    Raises io.FlistException when the input file is missing, or when input and
    output resolve to the same path (in-place substitution is not allowed).
    """
    if not Path(infilePath).exists():
        raise io.FlistException(f"{infilePath=} does not exist")
    same_file = Path(infilePath).resolve().absolute() == Path(outfilePath).resolve().absolute()
    if same_file:
        raise io.FlistException(
            f"no in-place substitution allowed -- {infilePath=} is the same as the output path"
        )
    with open(infilePath, "r") as infile, open(outfilePath, "w") as outfile:
        for line in infile:
            for key, value in kwargs.items():
                line = line.replace("${" + key + "}", value)
            outfile.write(line)
# Example #4
# 0
def translate(translation_df: pandas.DataFrame, lang: str, category_en: str):
    """
    Look up the *lang* translation of the English category *category_en*.

    Returns the *lang* column of the first row whose "en" column equals
    *category_en*; raises io.FlistException when no row matches.
    """
    # Vectorized lookup instead of a Python-level iterrows() scan; same
    # first-match semantics via iloc[0].
    matches = translation_df.loc[translation_df["en"] == category_en, lang]
    if len(matches) > 0:
        return matches.iloc[0]
    raise io.FlistException(
        f"could not determine correct translation for category {category_en}; check that all categories in raw input files and all dynamically-attributed categories are maintained in the file {get_category_file()}"
    )
 def Dataframe_From_File(file: Path, columns):
     """
     Read a single CSV file into a dataframe, naming its columns *columns*.

     Raises io.FlistException when *file* does not exist.
     """
     if not file.exists():
         raise io.FlistException(f"nonexistent file path: {file}")
     # No quoting, project-wide separator, no header row in the files, and
     # keep empty fields as "" rather than NaN.
     read_options = dict(
         quoting=csv.QUOTE_NONE,
         sep=CSV_SEP,
         header=None,
         names=columns,
         keep_default_na=False,
     )
     return pandas.read_csv(file, **read_options)
def substituteFileToStr(infilePath, **kwargs):
    """
    Return the content of *infilePath* with every ``${key}`` placeholder
    replaced by the corresponding value from *kwargs*.

    Raises io.FlistException when *infilePath* does not exist.
    """
    if not Path(infilePath).exists():
        raise io.FlistException(f"{infilePath=} does not exist")
    parts = []
    with open(infilePath, "r") as infile:
        for line in infile:
            for k, v in kwargs.items():
                line = line.replace("${" + k + "}", v)
            parts.append(line)
    # BUG FIX: the original accumulated the result in `builder` but never
    # returned it, so callers always got None. Also joins once instead of
    # quadratic string +=.
    return "".join(parts)
def load_legacy_and_infer_ids(tool, language) -> flist.SCSV_Dataset:
    """
    Load the SCSV dump for *tool*/*language* and copy onto its rows the ids
    inferred from the tool's reference file (matched positionally).

    Raises io.FlistException when the dump and the reference file have a
    different number of rows.
    """
    file = scsv_dump_file(tool, language)
    referenceFile = scsv_reference_file(tool)

    dataset = flist.SCSV_Dataset.From_Dataframe(
        flist.SCSV_Dataset.Dataframe_From_Files([file]))
    dataset_reference = flist.SCSV_Dataset.From_Dataframe(
        flist.SCSV_Dataset.Dataframe_From_Files([referenceFile]))
    if len(dataset_reference.rows) != len(dataset.rows):
        raise io.FlistException(
            f"could not extract legacy categories for {file} automatically: reference file {referenceFile} has a mismatching number of entries."
        )

    # Pair the rows positionally with zip instead of maintaining a manual
    # index counter (lengths are equal per the check above).
    for referenceEntry, entry in zip(dataset_reference.rows, dataset.rows):
        referenceEntry.infer_id_from_fields(tool)
        entry.id = referenceEntry.id

    return dataset
def getFileContent(path):
    """
    Return the full text content of the file at *path*.

    Raises io.FlistException when *path* is not a regular file.
    """
    # BUG FIX: the original tested `Path(path).is_file` -- the bound method
    # object, which is always truthy -- so the guard could never fire.
    if not Path(path).is_file():
        raise io.FlistException(f"not a file: {path}")
    with open(path, "r") as opened:
        # read() is equivalent to "".join(readlines()) and simpler
        return opened.read()
def Map_Columns(colname: str, translationfile: Path, input: Path, mapfile: Path, language: str, feedbackfile: Path, output: Path, id_diff=False):
    """
    Assign a value for column *colname* to every entry of the SCSV file *input*.

    Each entry's id is looked up first in *mapfile*, then (as a fallback) in
    *feedbackfile*; the mapped value is translated into *language* via
    *translationfile*, falling back to the untranslated value. The updated
    dataset is written to *output*. The feedback file is rewritten with the
    feedback-matched entries plus placeholder rows for entries that could not
    be mapped at all, so they can be filled in manually.

    Raises io.FlistException when *input* or *mapfile* does not exist.
    """
    if not input.is_file():
        raise io.FlistException(f"file {input} does not exist")
    if not mapfile.is_file():
        raise io.FlistException(f"file {mapfile} does not exist")

    translations = pandas.read_csv(translationfile, quoting=csv.QUOTE_NONE, sep=";")
    # NOTE(review): feedbackfile_static is never used below; kept because
    # relative_to() raises when feedbackfile lies outside the workspace, which
    # may be a deliberate validation -- confirm before removing.
    feedbackfile_static = Path(__file__).parent.parent / "ws-static" / feedbackfile.relative_to( implicitly("workspace").path )

    dataset = flist.SCSV_Dataset.From_Dataframe(flist.SCSV_Dataset.Dataframe_From_Files([input]))

    df_cats_input = pandas.read_csv(mapfile, quoting=csv.QUOTE_NONE, names=["id", colname, "path"], header=None, sep=";")
    df_cats_input.fillna('', inplace=True)
    # Drop rows whose id marks a disappeared entry or whose value is still the
    # "needs assignment" placeholder.
    df_cats_input = df_cats_input[
        (~ df_cats_input["id"].str.contains(blank_disappears)) & (~ df_cats_input[colname].str.contains(blank_placeholder))
    ]

    df_cats_feedback = pandas.DataFrame(columns = ["id", colname, "path"])
    if feedbackfile.is_file():
        df_cats_feedback = pandas.read_csv(feedbackfile, quoting=csv.QUOTE_NONE, names=["id", colname, "path"], header=None, sep=";")
        df_cats_feedback.fillna('', inplace=True)
        df_cats_feedback = df_cats_feedback[
            (~ (df_cats_feedback["id"].str.contains(blank_disappears))) & (~ (df_cats_feedback[colname].str.contains(blank_placeholder)))
        ]
        implicitly("prog.logger").debug(f"==== Feedback cat file file ===")
        implicitly("prog.logger").debug(df_cats_feedback.to_string())

    df_writeback_input = pandas.DataFrame(columns = ["id", colname, "path"])
    df_writeback_feedback = pandas.DataFrame(columns = ["id", colname, "path"])
    df_writeback_feedback_matched = pandas.DataFrame(columns = ["id", colname, "path"])
    df_writeback_feedback_missing = pandas.DataFrame(columns = ["id", colname, "path"])

    has_unmatched_categories = False
    implicitly("prog.logger").debug(f"{feedbackfile=}")
    catFromFeedbackCounter = 0
    catFromInputFileCounter = 0
    untranslatedCounter = 0

    for entry in dataset.rows:
        entry_id = entry.id  # renamed from `id`, which shadowed the builtin

        category_from_input = map_column(colname, df_cats_input, entry_id)
        category_from_feedback = map_column(colname, df_cats_feedback, entry_id)
        if id_diff:
            id_diff_and_map_column(colname, df_cats_feedback, entry_id, entry)
        implicitly("prog.logger").debug(f"getting cat for {entry_id}")
        implicitly("prog.logger").debug(f"{category_from_input=}, {category_from_feedback=}")

        if category_from_input:
            # DataFrame.append was removed in pandas 2.0; concat of a
            # single-row frame is the supported equivalent.
            df_writeback_input = pandas.concat([df_writeback_input, pandas.DataFrame([{"id": entry.id, colname: category_from_input}])])
            catFromInputFileCounter += 1
        elif category_from_feedback:
            df_writeback_feedback_matched = pandas.concat([df_writeback_feedback_matched, pandas.DataFrame([{"id": entry.id, colname: category_from_feedback, "path": entry.to_dataframe_dictionary()["path"]}])])
            catFromFeedbackCounter += 1

        if category_from_input or category_from_feedback:
            # The input-file mapping takes precedence over the feedback one.
            mapped_cat = category_from_input or category_from_feedback
            translated_mapped_cat = translate(translationfile, translations, language, mapped_cat)

            if not translated_mapped_cat:
                # Fall back to the untranslated value; only count it as
                # untranslated for non-english targets.
                if not language == "en":
                    untranslatedCounter += 1
                translated_mapped_cat = mapped_cat

            setattr(entry, colname, translated_mapped_cat)
        else:
            df_writeback_feedback_missing = pandas.concat([df_writeback_feedback_missing, pandas.DataFrame([{"id": entry.id, colname: blank_placeholder, "path": entry.to_dataframe_dictionary()["path"]}])])
            has_unmatched_categories = True

    # Matched entries first, then the placeholder rows for unmatched ones.
    df_writeback_feedback = pandas.concat([df_writeback_feedback, df_writeback_feedback_matched, df_writeback_feedback_missing])
    implicitly("prog.logger").info(f"{catFromInputFileCounter} of {len(dataset.rows)} entries have been assigned {colname}s based on their ids in {mapfile}")
    implicitly("prog.logger").info(f"{catFromFeedbackCounter} of {len(dataset.rows)} entries have been assigned {colname}s based on their ids in {feedbackfile}")
    implicitly("prog.logger").info(f"{(catFromFeedbackCounter+catFromInputFileCounter)-untranslatedCounter} of {catFromFeedbackCounter+catFromInputFileCounter} {colname}s have direct translations in {translationfile}, the rest inherits the english version")
    if catFromFeedbackCounter > 0:
        implicitly("prog.logger").info(f"[ NOTE ] when running the program with --initws, the file {feedbackfile} will be overwritten by the prototypical workspace and the manual entries may be lost. Consider merging them with the corresponding file of {input} in ./ws-static!")

    if has_unmatched_categories:
        # Was `api.implicitly(...)` -- made consistent with the bare
        # implicitly(...) used everywhere else in this function.
        implicitly("prog.logger").warning(f"[[ WARNING ]] : {len(df_writeback_feedback_missing)} entries in {input} could not be assigned {colname}s. To remedy this, edit the {colname}s manually in {feedbackfile}")

    df_writeback_feedback.to_csv(feedbackfile, quoting=csv.QUOTE_NONE, sep=flist.CSV_SEP, index=False, header=False)
    dataset.write_csv(output)
# Example #10
# 0
def PreprocessCT2(input: Path, output: Path, language: str):
    """
    preprocess csv generated by CT2 according to the findings in doc/ct2-generated-csv/incongruence.md

    Copies *input* to *output* line by line, filtering the known incongruent
    blocks: cases 1a and 1b for language "en", case 2a for language "de".
    A warning is printed whenever an expected special case was not seen.

    Raises io.FlistException when *input* is missing, *input* equals
    *output*, or *language* is neither "en" nor "de".
    """
    # read in
    if not input or not input.exists():
        raise io.FlistException(f"input file {input} does not exist")
    if input == output:
        raise io.FlistException(f"{input=} cannot be equal to {output=}")

    if language == "en":
        # delete:
        # 1a)
        # SIGABA Known Plaintext;C;
        # ;[C];Tools\\ Misc\\ SIGABA Known Plaintext
        # 1b)
        # Ciphertext-only;W;
        # [...]
        # ;[W];Cryptanalysis\\ Modern Encryption\\ Symmetric Encryption\\ DES\\ Ciphertext-only
        with open(input, "r") as inputreader:
            lines = inputreader.readlines()
        with open(output, "w") as outputwriter:
            # encountered* flags: currently inside the respective block.
            # fixed* flags: the block was actually seen (stays None otherwise).
            encountered1a = False
            encountered1b = False
            fixed1a = None
            fixed1b = None
            for line in lines:
                # Filter 1a) block: drop the header line, everything inside
                # the block, and the terminating blank line.
                if "SIGABA Known Plaintext;C;" in line:
                    encountered1a = True
                    continue
                if encountered1a and len(line.strip()) == 0:
                    encountered1a = False
                    # outputwriter.write(line)
                    continue
                if encountered1a:
                    fixed1a = True
                    continue

                # Filter 1b) block (the comment here used to say "1a"): keep
                # the block's lines but drop its trailing category line.
                if "Ciphertext-only;W;" in line:
                    encountered1b = True
                    outputwriter.write(line)
                    continue
                if encountered1b and len(line.strip()) == 0:
                    encountered1b = False
                    outputwriter.write(line)
                    continue
                if encountered1b and ";[W];Cryptanalysis\\ Modern Encryption\\ Symmetric Encryption\\ DES\\ Ciphertext-only" in line:
                    fixed1b = True
                    continue
                if encountered1b:
                    outputwriter.write(line)
                    continue

                # Default: copy the line through unchanged.
                outputwriter.write(line)
            if fixed1a is None:
                io.msg(
                    f"[[ WARNING ]] when preprocessing CT2 files, did not encounter special case 1a as described in doc/ct2-generated-csv/incongruence.md . This may be a non-issue, though."
                )
            if fixed1b is None:
                io.msg(
                    f"[[ WARNING ]] when preprocessing CT2 files, did not encounter special case 1b as described in doc/ct2-generated-csv/incongruence.md . This may be a non-issue, though."
                )

    elif language == "de":
        with open(input, "r") as inputreader:
            lines = inputreader.readlines()
        with open(output, "w") as outputwriter:
            encountered2a = None
            fixed2a = None
            for line in lines:
                # Filter 2a) block: drop the header, the block content, and
                # the terminating blank line (same shape as case 1a above).
                if "Ciphertext-only-Analyse;W;" in line:
                    encountered2a = True
                    continue
                if encountered2a and len(line.strip()) == 0:
                    encountered2a = False
                    # outputwriter.write(line)
                    continue
                if encountered2a:
                    fixed2a = True
                    continue

                outputwriter.write(line)
        if fixed2a is None:
            io.msg(
                f"[[ WARNING ]] when preprocessing CT2 files, did not encounter special case 2a as described in doc/ct2-generated-csv/incongruence.md . This may be a non-issue, though."
            )
        # 2a)
        # Ciphertext-only-Analyse;W;
        # ;[W];Kryptoanalyse\\ Moderne Verschlüsselung\\ Symmetrische Verschlüsselung\\ DES\\ Ciphertext-only-Analyse
    else:
        raise io.FlistException(f"unknown {language=}")
    # write out
    return
# Example #11
# 0
 def merge_ids(ids):
     """
     Join *ids* into a single SEP_ids-separated string.

     The ids must already be unique (equal to their uniq()-ed form);
     otherwise io.FlistException is raised.
     """
     uniqd = uniq(ids)
     # Validate before joining: the original built the merged string first,
     # doing wasted work on input it was about to reject.
     if ids != uniqd:
         raise io.FlistException(f"ids are not unique: {ids=} != {uniqd=}")
     return MCSV_Entry.SEP_ids.join(uniqd)