import csv
import json
import os
import time

from tqdm import tqdm

# Project-level helpers (DetectorResult, Dialect, Status, StatusMsg, and the
# various loaders and detectors used below) are assumed to be imported from
# the surrounding package.


def determine_dqr(filename, verbose=False):
    encoding = get_encoding(filename)
    data = load_file(filename, encoding=encoding)
    if data is None:
        return DetectorResult(status=Status.SKIP, status_msg=StatusMsg.UNREADABLE)

    dialects = get_dialects(data, encoding)
    scores = []
    for dialect in sorted(dialects):
        S = compute_suitability(data, dialect)
        if verbose:
            print("%15r\tsuitability = %.6f" % (dialect, S))
        scores.append((S, dialect))

    # The dialect with the lowest suitability score wins; break ties when
    # multiple dialects share the minimum.
    min_suit = min(x[0] for x in scores)
    min_dialects = [x[1] for x in scores if x[0] == min_suit]
    if len(min_dialects) > 1:
        res = break_ties(data, min_dialects)
    else:
        res = min_dialects[0]

    if res is None:
        return DetectorResult(
            status=Status.FAIL, status_msg=StatusMsg.MULTIPLE_ANSWERS
        )
    return DetectorResult(dialect=res, status=Status.OK)
def determine_dqr(filename, score_func, verbose=False, do_break_ties=True):
    encoding = get_encoding(filename)
    data = load_file(filename, encoding=encoding)
    if data is None:
        return DetectorResult(status=Status.SKIP, status_msg=StatusMsg.UNREADABLE)

    # Replace URLs by a single character. This removes many potential
    # delimiters that occur only in URLs and would otherwise add noise.
    dialects = get_potential_dialects(filter_urls(data), encoding)
    if not dialects:
        return DetectorResult(status=Status.FAIL, status_msg=StatusMsg.NO_DIALECTS)

    if verbose:
        print(
            "Length of data: %i\n"
            "Considering %i dialects\n" % (len(data), len(dialects))
        )

    scores = score_func(data, dialects, verbose=verbose)
    score_sort = sorted(
        [(scores[dialect], dialect) for dialect in scores],
        key=lambda x: x[0],
        reverse=True,
    )
    max_prob = score_sort[0][0]
    dialects_with_score = [x[1] for x in score_sort if x[0] == max_prob]

    if len(dialects_with_score) > 1:
        res = break_ties(data, dialects_with_score) if do_break_ties else None
    else:
        res = dialects_with_score[0]

    if res is None:
        if verbose:
            print("More than 1 parameter set!")
            for d in dialects_with_score:
                print(d)
        return DetectorResult(
            status=Status.FAIL, status_msg=StatusMsg.MULTIPLE_ANSWERS
        )
    return DetectorResult(dialect=res, status=Status.OK)
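# filter_urls is a helper from the surrounding package. A minimal sketch of
# what it might look like is given below, assuming each URL is replaced by a
# single placeholder character as the comment above describes; the regex and
# the "U" placeholder are illustrative assumptions, not the package's actual
# implementation.

import re

_URL_RE = re.compile(r"https?://[^\s'\"]+", re.IGNORECASE)


def filter_urls_sketch(data):
    # Collapse every URL to one character so that delimiters that occur only
    # inside URLs are not picked up as candidate dialect characters.
    return _URL_RE.sub("U", data)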
def batch_process(path_file, output_file):
    with open(path_file, "r") as fid:
        files = [l.strip() for l in fid.readlines()]
    files.sort()

    previous = load_previous(output_file)
    done = [x for x in files if x in previous and "dialect" in previous[x]]
    skipped = [
        x for x in files if x in previous and previous[x]["status"] == "SKIP"
    ]
    todo = [x for x in files if not (x in done or x in skipped)]
    if not todo:
        print("All done.")
        return
    print("Number of files remaining: %i" % len(todo))

    less_pane = init_tmux()
    count = 0
    start_time = time.time()
    for filename in todo:
        old_res = previous.get(filename, None)
        if not os.path.exists(filename):
            print("File not found: %s" % filename)
            res = DetectorResult(
                status=Status.SKIP, status_msg=StatusMsg.NON_EXISTENT
            )
            res.filename = filename
            # Record the missing file so it isn't retried on the next run.
            dump_result(output_file, res)
            continue
        res = annotate_file(filename, less_pane, old_res)
        res.filename = filename
        dump_result(output_file, res)
        count += 1
        if count % 10 == 0:
            print(
                "\nProgress: %i done out of %i. "
                "This session: %i (%.2f seconds per file)"
                % (count, len(todo), count, (time.time() - start_time) / count)
            )
    print("All done.")
def main(
    path_file,
    output_file,
    determine_dqr=None,
    detector=None,
    verbose=False,
    progress=False,
):
    with open(path_file, "r") as fid:
        files = [l.strip() for l in fid.readlines()]
    files.sort()

    previous = load_previous(output_file)
    for filename in tqdm(files, disable=not progress, desc=detector):
        if filename in previous:
            continue
        if not os.path.exists(filename):
            res = DetectorResult(
                detector=detector,
                dialect=None,
                filename=filename,
                runtime=None,
                status=Status.FAIL,
                status_msg=StatusMsg.NON_EXISTENT,
            )
            dump_result(output_file, res)
            continue

        if not progress:
            print("[%s] Analyzing file: %s" % (detector, filename))

        start_time = time.time()
        try:
            res = determine_dqr(filename, verbose=verbose)
        except KeyboardInterrupt:
            raise
        except Exception:
            print("Uncaught exception occurred while parsing file: %s" % filename)
            raise

        res.runtime = time.time() - start_time
        res.filename = filename
        res.detector = detector
        dump_result(output_file, res)
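# load_previous and dump_result are assumed helpers for the JSON-lines
# results file: one result record per line, keyed by filename. A minimal
# sketch under that assumption follows; the names and the dict-based record
# format are illustrative, inferred from how previous[x]["status"] and
# to_json() are used elsewhere in this section.


def load_previous_sketch(output_file):
    # Map filename -> previously recorded result; tolerate a missing file
    # on the first run.
    previous = {}
    if not os.path.exists(output_file):
        return previous
    with open(output_file, "r") as fid:
        for line in fid:
            record = json.loads(line)
            previous[record["filename"]] = record
    return previous


def dump_result_sketch(output_file, res):
    # Append a single record; appending keeps earlier results intact so
    # interrupted runs can resume where they left off.
    with open(output_file, "a") as fid:
        fid.write(res.to_json() + "\n")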
def main(normal_file, output_file):
    normals = load_normals(normal_file)
    results = {}
    for entry in tqdm(normals):
        filename = entry["filename"]
        form_id = entry["form_id"]
        params = entry["params"]
        if form_id == "FAIL":
            # unreadable file
            dr = DetectorResult(
                detector="normal",
                filename=filename,
                status=Status.FAIL,
                status_msg=StatusMsg.UNREADABLE,
            )
        else:
            dialect = Dialect(
                delimiter=params["delim"],
                quotechar=params["quotechar"],
                escapechar=params["escapechar"],
            )
            dr = DetectorResult(
                detector="normal",
                dialect=dialect,
                filename=filename,
                status=Status.OK,
            )
        if filename in results:
            raise KeyError("Filename %s already exists, duplicate!" % filename)
        results[filename] = dr

    with open(output_file, "w") as fid:
        for filename in sorted(results.keys()):
            fid.write(results[filename].to_json() + "\n")
    print("All done.")
def determine_dqr(filename, verbose=False): """ Run the python CSV Sniffer """ encoding = get_encoding(filename) data = load_file(filename, encoding=encoding) if data is None: return DetectorResult(status=Status.SKIP, status_msg=StatusMsg.UNREADABLE) try: dialect = sniff(data) except csv.Error: return DetectorResult(status=Status.FAIL, status_msg=StatusMsg.NO_RESULTS) config = { "delimiter": dialect.delimiter, "quotechar": dialect.quotechar, "escapechar": dialect.escapechar, } res = DetectorResult(dialect=Dialect.from_dict(config), status=Status.OK) return res
def main(output_file, input_files):
    combined = {}
    for filename in input_files:
        with open(filename, "r") as fid:
            for line in fid:
                dr = DetectorResult.from_json(line.strip())
                if dr.filename in combined:
                    if dr.dialect == combined[dr.filename].dialect:
                        # Allow duplicates as long as the dialect is the same.
                        continue
                    raise KeyError("Duplicate result for file: %s" % dr.filename)
                combined[dr.filename] = dr

    with open(output_file, "w") as fid:
        for filename in sorted(combined.keys()):
            dr = combined[filename]
            dr.original_detector = dr.detector
            dr.detector = "reference"
            fid.write(dr.to_json() + "\n")
def annotate_file(filename, less_pane, previous):
    print("")
    encoding = get_encoding(filename)
    data = load_file(filename, encoding=encoding)

    if previous:
        ask_delim = "delimiter" not in previous
        ask_quotechar = "quotechar" not in previous and has_quotechar(data)
        ask_escapechar = "escapechar" not in previous
    else:
        ask_delim = True
        ask_quotechar = has_quotechar(data)
        ask_escapechar = True

    print("Annotating file: %s" % filename)
    res, note = ask_dqe(
        filename,
        data,
        encoding,
        ask_delim,
        ask_quotechar,
        ask_escapechar,
        previous,
        less_pane,
    )

    out = DetectorResult(
        detector="human", filename=filename, runtime=None, status=Status.OK
    )
    if note:
        out.note = note
    if res is None:
        less_pane.send_keys("q")
        less_pane.clear()
        out.status = Status.SKIP
        out.status_msg = StatusMsg.HUMAN_SKIP
        return out

    if res["delimiter"] is None:
        res["delimiter"] = ""
    if res["quotechar"] is None:
        res["quotechar"] = ""
    out.dialect = Dialect.from_dict(res)
    return out
def load_detector_results(result_file):
    """Load the results from a given detector result file.

    Verify each record in the process.
    """
    detector_names = set()
    results = {}
    with open(result_file, "r") as fid:
        for idx, line in enumerate(fid):
            try:
                record = DetectorResult.from_json(line.strip())
            except json.JSONDecodeError:
                print(
                    "\nError parsing the following record in file (line %i): "
                    "%s\n---\n%s" % (idx + 1, result_file, line.strip())
                )
                raise SystemExit(1)
            detector_names.add(record.detector)
            fname = record.filename
            if not os.path.isabs(fname):
                fname = os.path.abspath(fname)
            record.filename = fname
            if fname in results:
                raise ValueError(
                    "Duplicate result for file %s in detector file %s"
                    % (record.filename, result_file)
                )
            record.validate()
            results[fname] = record

    if len(detector_names) > 1:
        raise ValueError(
            "More than one detector name in file: %s" % result_file
        )
    detector = detector_names.pop()
    return detector, results
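# Example use of load_detector_results, assuming a JSON-lines results file
# produced by one of the runners above (the path is illustrative):
#
#     detector, results = load_detector_results("results/sniffer.json")
#     print(detector, len(results))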
def wrap_determine_dqr(filename, verbose=False):
    res = run_with_timeout((filename,), {"verbose": verbose}, TIMEOUT)
    if res is None:
        return DetectorResult(status=Status.FAIL, status_msg=StatusMsg.TIMEOUT)
    return res
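# run_with_timeout and TIMEOUT come from the surrounding package; a minimal
# sketch of one possible implementation follows, assuming the target function
# is determine_dqr and using a worker process that is terminated when the
# time limit expires. The queue-based plumbing is illustrative, not the
# package's actual mechanism.

import multiprocessing


def _worker(queue, args, kwargs):
    queue.put(determine_dqr(*args, **kwargs))


def run_with_timeout_sketch(args, kwargs, limit):
    queue = multiprocessing.Queue()
    proc = multiprocessing.Process(target=_worker, args=(queue, args, kwargs))
    proc.start()
    proc.join(limit)
    if proc.is_alive():
        # Time limit exceeded: kill the worker and signal a timeout by
        # returning None, which the wrapper above maps to a FAIL result.
        proc.terminate()
        proc.join()
        return None
    return queue.get() if not queue.empty() else None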