def MergeImpl(input: List[Path], output: Path):
    """Merge the SCSV rows of all *input* files into a single MCSV file at *output*.

    Every SCSV row must match exactly one merged-functionality accumulator,
    otherwise an io.FlistException is raised.
    """
    implicitly("prog.logger").debug(f"Running MergeImpl({input=}, {output=})")
    # Load and combine every input file into one SCSV dataset.
    combined = flist.SCSV_Dataset.From_Dataframe(
        flist.SCSV_Dataset.Dataframe_From_Files(input))
    # One accumulator per distinct functionality present in the dataset.
    merged_rows = [
        flist.Merged_Functionality(functionality)
        for functionality in combined.get_functionalities()
    ]
    for scsv_row in combined.get_rows():
        merge_targets = [
            candidate for candidate in merged_rows
            if candidate.functionality == scsv_row["functionality"]
        ]
        if not merge_targets:
            raise io.FlistException(f"no merge targets found for {scsv_row}")
        if len(merge_targets) > 1:
            raise io.FlistException(
                f"multiple merge targets found for {scsv_row}: {merge_targets}")
        # Exactly one target is guaranteed at this point.
        merge_targets[0].merge_with(scsv_row)
    # Convert the accumulators to MCSV rows and persist them.
    mcsv_all = flist.MCSV_Dataset.From_Rows(
        [merged.to_MCSV() for merged in merged_rows])
    mcsv_all.write_csv(output)
def Dataframe_From_Files(files, columns):
    """Build one dataframe spanning the content of all given csv files.

    Each file is parsed with *columns* as its column names and the per-file
    frames are concatenated in input order.

    Raises io.FlistException for an empty file list or a nonexistent file.
    """
    if not files:
        raise io.FlistException("empty csv file list")
    # Fail early on the first missing file, before parsing anything.
    for file in files:
        if not Path(file).exists():
            raise io.FlistException(f"file {file} does not exist")
    return pandas.concat(
        CSV_Dataset.Dataframe_From_File(file, columns) for file in files)
def substituteFileToFile(infilePath, outfilePath, **kwargs):
    """Copy *infilePath* to *outfilePath*, replacing each ``${key}`` placeholder
    with the value of the corresponding keyword argument.

    Raises io.FlistException when the input file does not exist or when input
    and output resolve to the same path (no in-place substitution).
    """
    if not Path(infilePath).exists():
        raise io.FlistException(f"{infilePath=} does not exist")
    same_file = (Path(infilePath).resolve().absolute()
                 == Path(outfilePath).resolve().absolute())
    if same_file:
        raise io.FlistException(
            f"no in-place substitution allowed -- {infilePath=} is the same as the output path"
        )
    with open(outfilePath, "w") as outfile, open(infilePath, "r") as infile:
        for line in infile:
            for key, value in kwargs.items():
                line = line.replace("${" + key + "}", value)
            outfile.write(line)
def translate(translation_df: pandas.DataFrame, lang: str, category_en: str):
    """Look up the *lang* translation of the english category *category_en*.

    *translation_df* must have an "en" column plus one column per target
    language; the first row whose "en" value equals *category_en* wins.

    Raises io.FlistException when no row matches.
    """
    matches = translation_df[translation_df["en"] == category_en]
    if not matches.empty:
        return matches.iloc[0][lang]
    raise io.FlistException(
        f"could not determine correct translation for category {category_en}; check that all categories in raw input files and all dynamically-attributed categories are maintained in the file {get_category_file()}"
    )
def Dataframe_From_File(file: Path, columns):
    """Read a single csv file into a dataframe using *columns* as column names.

    The file is parsed with the project-wide CSV_SEP separator, no quoting and
    no NA conversion, and is expected to carry no header row.

    Raises io.FlistException when *file* does not exist.
    """
    if not file.exists():
        raise io.FlistException(f"nonexistent file path: {file}")
    read_options = dict(
        quoting=csv.QUOTE_NONE,
        sep=CSV_SEP,
        header=None,
        names=columns,
        keep_default_na=False,
    )
    return pandas.read_csv(file, **read_options)
def substituteFileToStr(infilePath, **kwargs):
    """Read *infilePath* and return its content with every ``${key}``
    placeholder replaced by the value of the corresponding keyword argument.

    Raises io.FlistException when the input file does not exist.
    """
    if not Path(infilePath).exists():
        raise io.FlistException(f"{infilePath=} does not exist")
    parts = []
    with open(infilePath, "r") as infile:
        while replaced := infile.readline():
            for k, v in kwargs.items():
                replaced = replaced.replace("${" + k + "}", v)
            parts.append(replaced)
    # BUG FIX: the original accumulated the substituted text into a local
    # string but never returned it, so every caller received None.  Also use
    # str.join instead of repeated += (linear instead of quadratic).
    return "".join(parts)
def load_legacy_and_infer_ids(tool, language) -> flist.SCSV_Dataset:
    """Load the scsv dump for (*tool*, *language*) and copy onto each of its
    rows the id inferred from the corresponding row of the tool's reference
    file.

    Raises io.FlistException when the two files have a different number of
    entries (rows are matched purely by position).
    """
    file = scsv_dump_file(tool, language)
    referenceFile = scsv_reference_file(tool)
    dataset = flist.SCSV_Dataset.From_Dataframe(
        flist.SCSV_Dataset.Dataframe_From_Files([file]))
    dataset_reference = flist.SCSV_Dataset.From_Dataframe(
        flist.SCSV_Dataset.Dataframe_From_Files([referenceFile]))
    if len(dataset_reference.rows) != len(dataset.rows):
        raise io.FlistException(
            f"could not extract legacy categories for {file} automatically: reference file {referenceFile} has a mismatching number of entries."
        )
    # Walk both datasets in lockstep and transfer the inferred ids.
    for entry, referenceEntry in zip(dataset.rows, dataset_reference.rows):
        referenceEntry.infer_id_from_fields(tool)
        entry.id = referenceEntry.id
    return dataset
def getFileContent(path):
    """Return the full text content of the file at *path*.

    Raises io.FlistException when *path* is not a regular file.
    """
    # BUG FIX: the original tested `Path(path).is_file` without calling it;
    # the bound-method object is always truthy, so `not ...` was always False
    # and the guard never fired.
    if not Path(path).is_file():
        raise io.FlistException(f"not a file: {path}")
    with open(path, "r") as opened:
        # read() is equivalent to "".join(readlines()) without the
        # intermediate list.
        return opened.read()
def Map_Columns(colname: str, translationfile: Path, input: Path, mapfile: Path, language: str, feedbackfile: Path, output: Path, id_diff=False):
    """Assign a value for column *colname* to every row of the SCSV file *input*.

    For each row, the value is looked up by row id first in *mapfile*, then in
    *feedbackfile*; a found value is translated to *language* via *translationfile*
    (falling back to the untranslated value). Rows that match neither source are
    written back to *feedbackfile* with a blank placeholder so a human can fill
    them in. The updated dataset is finally written to *output*.

    Raises io.FlistException when *input* or *mapfile* does not exist.
    """
    if not input.is_file():
        raise io.FlistException(f"file {input} does not exist")
    if not mapfile.is_file():
        raise io.FlistException(f"file {mapfile} does not exist")
    # Translation table: one "en" column plus one column per target language.
    translations = pandas.read_csv(translationfile, quoting=csv.QUOTE_NONE, sep=";")
    # NOTE(review): feedbackfile_static is computed but never used in this
    # function -- presumably left over or used via side effect; confirm.
    feedbackfile_static = Path(__file__).parent.parent / "ws-static" / feedbackfile.relative_to(
        implicitly("workspace").path
    )
    dataset = flist.SCSV_Dataset.From_Dataframe(flist.SCSV_Dataset.Dataframe_From_Files([input]))
    # NOTE(review): catmapping_all is also never used below -- dead variable?
    catmapping_all = pandas.DataFrame(columns = ["id", colname])
    # Primary id -> colname mapping from the static map file.
    df_cats_input = pandas.read_csv(mapfile, quoting=csv.QUOTE_NONE, names=["id", colname, "path"], header=None, sep=";")
    df_cats_input.fillna('', inplace=True)
    # Drop rows flagged as "disappears" or still carrying the blank placeholder.
    df_cats_input = df_cats_input[
        (~ df_cats_input["id"].str.contains(blank_disappears))
        & (~ df_cats_input[colname].str.contains(blank_placeholder))
    ]
    # Secondary mapping from the (optional) human-maintained feedback file.
    df_cats_feedback = pandas.DataFrame(columns = ["id", colname, "path"])
    if feedbackfile.is_file():
        df_cats_feedback = pandas.read_csv(feedbackfile, quoting=csv.QUOTE_NONE, names=["id", colname, "path"], header=None, sep=";")
        df_cats_feedback.fillna('', inplace=True)
        df_cats_feedback = df_cats_feedback[
            (~ (df_cats_feedback["id"].str.contains(blank_disappears)))
            & (~ (df_cats_feedback[colname].str.contains(blank_placeholder)))
        ]
    implicitly("prog.logger").debug(f"==== Feedback cat file file ===")
    implicitly("prog.logger").debug(df_cats_feedback.to_string())
    # implicitly("prog.logger").debug(df_cats_feedback.to_string())
    # print(df_cats_input.to_string())
    # Writeback frames: matched/missing rows destined for the feedback file.
    df_writeback_input = pandas.DataFrame(columns = ["id", colname, "path"])
    df_writeback_feedback = pandas.DataFrame(columns = ["id", colname, "path"])
    df_writeback_feedback_matched = pandas.DataFrame(columns = ["id", colname, "path"])
    df_writeback_feedback_missing = pandas.DataFrame(columns = ["id", colname, "path"])
    has_unmatched_categories = False
    implicitly("prog.logger").debug(f"{feedbackfile=}")
    catFromFeedbackCounter = 0
    catFromInputFileCounter = 0
    untranslatedCounter = 0
    # print(df_cats_input.to_string())
    for entry in dataset.rows:
        # NOTE(review): `id` shadows the builtin here; kept as-is.
        id = entry.id
        category = entry.category
        # if category != flist.SCSV_Entry.dynamic_category_notset(): # TODO: configurable?
        #     raise io.FlistException(f"SCSV file {input} is being assigned categories dynamically, but has {colname} {category} which is not the 'needs {colname} assignment' placeholder that was expected")
        # Static map file wins over the feedback file (see elif below).
        category_from_input = map_column(colname, df_cats_input, id)
        category_from_feedback = map_column(colname, df_cats_feedback, id)
        if id_diff:
            id_diff_and_map_column(colname, df_cats_feedback, id, entry)
        implicitly("prog.logger").debug(f"getting cat for {id}")
        implicitly("prog.logger").debug(f"{category_from_input=}, {category_from_feedback=}")
        # NOTE(review): DataFrame.append is deprecated and removed in
        # pandas >= 2.0 -- this code requires pandas < 2; confirm pin.
        if category_from_input:
            df_writeback_input = df_writeback_input.append([{"id":entry.id, colname:category_from_input}])
            catFromInputFileCounter += 1
        elif category_from_feedback:
            df_writeback_feedback_matched = df_writeback_feedback_matched.append([{"id": entry.id, colname: category_from_feedback, "path": entry.to_dataframe_dictionary()["path"]}])
            catFromFeedbackCounter += 1
        if category_from_input or category_from_feedback:
            mapped_cat = category_from_input or category_from_feedback
            translated_mapped_cat = translate(translationfile, translations, language, mapped_cat)
            if not translated_mapped_cat:
                # English is the source language, so a missing translation
                # only counts as "untranslated" for other languages.
                if not language == "en":
                    untranslatedCounter += 1
                translated_mapped_cat = mapped_cat
            setattr(entry, colname, translated_mapped_cat)
        else:
            # No mapping anywhere: emit a placeholder row for manual editing.
            df_writeback_feedback_missing = df_writeback_feedback_missing.append([{"id": entry.id, colname: blank_placeholder, "path": entry.to_dataframe_dictionary()["path"]}])
            has_unmatched_categories = True
    # Matched rows first, then the still-missing placeholder rows.
    df_writeback_feedback = df_writeback_feedback.append(df_writeback_feedback_matched)
    df_writeback_feedback = df_writeback_feedback.append(df_writeback_feedback_missing)
    implicitly("prog.logger").info(f"{catFromInputFileCounter} of {len(dataset.rows)} entries have been assigned {colname}s based on their ids in {mapfile}")
    implicitly("prog.logger").info(f"{catFromFeedbackCounter} of {len(dataset.rows)} entries have been assigned {colname}s based on their ids in {feedbackfile}")
    implicitly("prog.logger").info(f"{(catFromFeedbackCounter+catFromInputFileCounter)-untranslatedCounter} of {catFromFeedbackCounter+catFromInputFileCounter} {colname}s have direct translations in {translationfile}, the rest inherits the english version")
    if catFromFeedbackCounter > 0:
        implicitly("prog.logger").info(f"[ NOTE ] when running the program with --initws, the file {feedbackfile} will be overwritten by the prototypical workspace and the manual entries may be lost. Consider merging them with the corresponding file of {input} in ./ws-static!")
    if has_unmatched_categories:
        # NOTE(review): this is the only call spelled api.implicitly(...)
        # instead of implicitly(...) -- verify both resolve to the same helper.
        api.implicitly("prog.logger").warning(f"[[ WARNING ]] : {len(df_writeback_feedback_missing)} entries in {input} could not be assigned {colname}s. To remedy this, edit the {colname}s manually in {feedbackfile}")
    # implicitly("prog.logger").info(f"writing feedbackfile with ids to {feedbackfile} with ids: {df_writeback_feedback['id'].tolist()}")
    # Rewrite the feedback file (matched + missing rows), then the dataset.
    df_writeback_feedback.to_csv(feedbackfile, quoting=csv.QUOTE_NONE, sep=flist.CSV_SEP, index=False, header=False)
    dataset.write_csv(output)
def PreprocessCT2(input: Path, output: Path, language: str):
    """Preprocess csv generated by CT2 according to the findings in
    doc/ct2-generated-csv/incongruence.md: copy *input* to *output* line by
    line, dropping the known incongruent blocks for the given *language*.

    Raises io.FlistException when *input* is missing, equals *output*, or
    *language* is neither "en" nor "de".
    """
    # read in
    if not input or not input.exists():
        raise io.FlistException(f"input file {input} does not exist")
    if input == output:
        raise io.FlistException(f"{input=} cannot be equal to {output=}")
    if language == "en":
        # delete:
        # 1a)
        # SIGABA Known Plaintext;C;
        # ;[C];Tools\\ Misc\\ SIGABA Known Plaintext
        # 1b)
        # Ciphertext-only;W;
        # [...]
        # ;[W];Cryptanalysis\\ Modern Encryption\\ Symmetric Encryption\\ DES\\ Ciphertext-only
        with open(input, "r") as inputreader:
            lines = inputreader.readlines()
        with open(output, "w") as outputwriter:
            # encountered*: currently inside the block; fixed*: block was seen
            # and its payload dropped (None means "never encountered").
            encountered1a = False
            encountered1b = False
            fixed1a = None
            fixed1b = None
            for line in lines:
                # Filter 1a) block: drop the header line...
                if "SIGABA Known Plaintext;C;" in line:
                    encountered1a = True
                    continue
                # ...until the first blank line ends the block...
                if encountered1a and len(line.strip()) == 0:
                    encountered1a = False
                    # outputwriter.write(line)
                    continue
                # ...dropping every payload line in between.
                if encountered1a:
                    fixed1a = True
                    continue
                # Filter 1b) block: keep the header and the closing blank line,
                # drop only the one incongruent DES path line inside it.
                if "Ciphertext-only;W;" in line:
                    encountered1b = True
                    outputwriter.write(line)
                    continue
                if encountered1b and len(line.strip()) == 0:
                    encountered1b = False
                    outputwriter.write(line)
                    continue
                if encountered1b and ";[W];Cryptanalysis\\ Modern Encryption\\ Symmetric Encryption\\ DES\\ Ciphertext-only" in line:
                    fixed1b = True
                    continue
                if encountered1b:
                    outputwriter.write(line)
                    continue
                # Default: copy the line through unchanged.
                outputwriter.write(line)
        if fixed1a is None:
            io.msg(
                f"[[ WARNING ]] when preprocessing CT2 files, did not encounter special case 1a as described in doc/ct2-generated-csv/incongruence.md . This may be a non-issue, though."
            )
        if fixed1b is None:
            io.msg(
                f"[[ WARNING ]] when preprocessing CT2 files, did not encounter special case 1b as described in doc/ct2-generated-csv/incongruence.md . This may be a non-issue, though."
            )
    elif language == "de":
        with open(input, "r") as inputreader:
            lines = inputreader.readlines()
        with open(output, "w") as outputwriter:
            encountered2a = None
            fixed2a = None
            for line in lines:
                # Filter 2a) block: drop header and payload until the blank line.
                if "Ciphertext-only-Analyse;W;" in line:
                    encountered2a = True
                    continue
                if encountered2a and len(line.strip()) == 0:
                    encountered2a = False
                    # outputwriter.write(line)
                    continue
                if encountered2a:
                    fixed2a = True
                    continue
                outputwriter.write(line)
        if fixed2a is None:
            io.msg(
                f"[[ WARNING ]] when preprocessing CT2 files, did not encounter special case 2a as described in doc/ct2-generated-csv/incongruence.md . This may be a non-issue, though."
            )
        # 2a)
        # Ciphertext-only-Analyse;W;
        # ;[W];Kryptoanalyse\\ Moderne Verschlüsselung\\ Symmetrische Verschlüsselung\\ DES\\ Ciphertext-only-Analyse
    else:
        raise io.FlistException(f"unknown {language=}")
    # write out
    return
def merge_ids(ids):
    """Join *ids* with the MCSV id separator.

    *ids* must already be duplicate-free (order-preserving comparison against
    their uniq'd form); otherwise an io.FlistException is raised.
    """
    uniqd = uniq(ids)
    # Join first, validate second -- preserves the original evaluation order.
    joined = MCSV_Entry.SEP_ids.join(uniqd)
    if ids != uniqd:
        raise io.FlistException(f"ids are not unique: {ids=} != {uniqd=}")
    return joined