Code Example #1
def determine_dqr(filename, verbose=False):
    encoding = get_encoding(filename)
    data = load_file(filename, encoding=encoding)
    if data is None:
        return DetectorResult(status=Status.SKIP,
                              status_msg=StatusMsg.UNREADABLE)

    dialects = get_dialects(data, encoding)
    scores = []

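    # Score every candidate dialect; the dialect with the lowest
    # suitability wins. Ties are broken further below.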
    for dialect in sorted(dialects):
        S = compute_suitability(data, dialect)
        if verbose:
            print("%15r\tsuitability = %.6f" % (dialect, S))
        scores.append((S, dialect))

    min_suit = min(x[0] for x in scores)
    min_dialects = [x[1] for x in scores if x[0] == min_suit]

    if len(min_dialects) > 1:
        res = break_ties(data, min_dialects)
    else:
        res = min_dialects[0]

    if res is None:
        return DetectorResult(status=Status.FAIL,
                              status_msg=StatusMsg.MULTIPLE_ANSWERS)

    res = DetectorResult(dialect=res, status=Status.OK)

    return res
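A minimal usage sketch (the path is hypothetical; it assumes the names determine_dqr and Status from the excerpt above):

result = determine_dqr("data/sample.csv", verbose=True)
if result.status == Status.OK:
    print("Detected dialect:", result.dialect)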
Code Example #2
def determine_dqr(filename, score_func, verbose=False, do_break_ties=True):
    encoding = get_encoding(filename)
    data = load_file(filename, encoding=encoding)
    if data is None:
        return DetectorResult(status=Status.SKIP,
                              status_msg=StatusMsg.UNREADABLE)

    # Fix-up: replace URLs in the data by a single character. This removes
    # many potential delimiters that occur only in URLs and would add noise.
    dialects = get_potential_dialects(filter_urls(data), encoding)
    if not dialects:
        return DetectorResult(status=Status.FAIL,
                              status_msg=StatusMsg.NO_DIALECTS)

    if verbose:
        print("Length of data: %i\n"
              "Considering %i dialects\n" % (len(data), len(dialects)))

    scores = score_func(data, dialects, verbose=verbose)

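    # Rank candidates from highest to lowest score; keying on the score
    # alone avoids comparing Dialect objects directly.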
    score_sort = sorted(
        [(scores[dialect], dialect) for dialect in scores],
        key=lambda x: x[0],
        reverse=True,
    )

    max_prob = score_sort[0][0]
    dialects_with_score = [x[1] for x in score_sort if x[0] == max_prob]

    if len(dialects_with_score) > 1:
        if do_break_ties:
            res = break_ties(data, dialects_with_score)
        else:
            res = None
    else:
        res = dialects_with_score[0]

    if res is None:
        if verbose:
            print("More than 1 parameter set!")
            for d in dialects_with_score:
                print(d)
        return DetectorResult(status=Status.FAIL,
                              status_msg=StatusMsg.MULTIPLE_ANSWERS)

    res = DetectorResult(dialect=res, status=Status.OK)

    return res
Code Example #3
def batch_process(path_file, output_file):
    with open(path_file, "r") as fid:
        files = [l.strip() for l in fid.readlines()]
    files.sort()

    previous = load_previous(output_file)

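    # Partition the input: files already annotated, files skipped earlier,
    # and files still to do. Sets make the membership tests below cheap.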
    done = {x for x in files if x in previous and "dialect" in previous[x]}
    skipped = {
        x for x in files if x in previous and previous[x]["status"] == "SKIP"
    }
    todo = [x for x in files if not (x in done or x in skipped)]

    if not todo:
        print("All done.")
        return

    print("Number of files remaining: %i" % len(todo))

    less_pane = init_tmux()

    count = 0
    start_time = time.time()
    for filename in todo:
        old_res = previous.get(filename, None)

        if not os.path.exists(filename):
            print("File not found: %s" % filename)
            # Record missing files as skipped so they are not retried on
            # the next run.
            res = DetectorResult(status=Status.SKIP,
                                 status_msg=StatusMsg.NON_EXISTENT)
            res.filename = filename
            dump_result(output_file, res)
            continue

        res = annotate_file(filename, less_pane, old_res)
        res.filename = filename
        dump_result(output_file, res)
        count += 1

        if count % 10 == 0:
            print("\nProgress: %i of %i this session "
                  "(%.2f seconds per file)" % (
                      count,
                      len(todo),
                      (time.time() - start_time) / count,
                  ))

    print("All done.")
Code Example #4
File: core.py  Project: zahradaryan123/CSV_Wrangling
def main(
    path_file,
    output_file,
    determine_dqr=None,
    detector=None,
    verbose=False,
    progress=False,
):
    with open(path_file, "r") as fid:
        files = [l.strip() for l in fid.readlines()]
    files.sort()

    previous = load_previous(output_file)

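    # Skip files that already have a result so an interrupted run can be
    # restarted and resume where it left off.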
    for filename in tqdm(files, disable=not progress, desc=detector):
        if filename in previous:
            continue

        if not os.path.exists(filename):
            res = DetectorResult(
                detector=detector,
                dialect=None,
                filename=filename,
                runtime=None,
                status=Status.FAIL,
                status_msg=StatusMsg.NON_EXISTENT,
            )
            dump_result(output_file, res)
            continue

        if not progress:
            print("[%s] Analyzing file: %s" % (detector, filename))

        start_time = time.time()
        try:
            res = determine_dqr(filename, verbose=verbose)
        except KeyboardInterrupt:
            raise
        except Exception:
            print("Uncaught exception occurred while parsing file: %s"
                  % filename)
            raise

        res.runtime = time.time() - start_time
        res.filename = filename
        res.detector = detector
        dump_result(output_file, res)
Code Example #5
def main(normal_file, output_file):
    normals = load_normals(normal_file)

    results = {}
    for entry in tqdm(normals):
        filename = entry["filename"]
        form_id = entry["form_id"]
        params = entry["params"]

        if form_id == "FAIL":
            # unreadable file
            dr = DetectorResult(
                detector="normal",
                filename=filename,
                status=Status.FAIL,
                status_msg=StatusMsg.UNREADABLE,
            )
        else:
            dialect = Dialect(
                delimiter=params["delim"],
                quotechar=params["quotechar"],
                escapechar=params["escapechar"],
            )

            dr = DetectorResult(
                detector="normal",
                dialect=dialect,
                filename=filename,
                status=Status.OK,
            )

        if filename in results:
            raise KeyError("Filename %s already exists, duplicate!" % filename)

        results[filename] = dr

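    # Write one JSON record per line, sorted by filename for stable output.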
    with open(output_file, "w") as fid:
        for filename in sorted(results.keys()):
            fid.write(results[filename].to_json() + "\n")

    print("All done.")
Code Example #6
def determine_dqr(filename, verbose=False):
    """ Run the python CSV Sniffer """
    encoding = get_encoding(filename)
    data = load_file(filename, encoding=encoding)
    if data is None:
        return DetectorResult(status=Status.SKIP,
                              status_msg=StatusMsg.UNREADABLE)

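    # sniff() presumably wraps csv.Sniffer().sniff(), which raises
    # csv.Error when it cannot determine the dialect.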
    try:
        dialect = sniff(data)
    except csv.Error:
        return DetectorResult(status=Status.FAIL,
                              status_msg=StatusMsg.NO_RESULTS)

    config = {
        "delimiter": dialect.delimiter,
        "quotechar": dialect.quotechar,
        "escapechar": dialect.escapechar,
    }
    res = DetectorResult(dialect=Dialect.from_dict(config), status=Status.OK)

    return res
Code Example #7
def main(output_file, input_files):
    combined = {}
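    # Merge the per-detector result files; the same file may appear more
    # than once only if every record agrees on the dialect.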
    for filename in input_files:
        with open(filename, "r") as fid:
            for line in fid:
                dr = DetectorResult.from_json(line.strip())
                if dr.filename in combined:
                    if dr.dialect == combined[dr.filename].dialect:
                        # allow it if the dialect is the same
                        continue
                    else:
                        raise KeyError("Duplicate result for file: %s" %
                                       dr.filename)
                combined[dr.filename] = dr

    with open(output_file, "w") as fid:
        for filename in sorted(combined.keys()):
            dr = combined[filename]
            dr.original_detector = dr.detector
            dr.detector = "reference"
            fid.write(dr.to_json() + "\n")
Code Example #8
def annotate_file(filename, less_pane, previous):
    print("")
    encoding = get_encoding(filename)
    data = load_file(filename, encoding=encoding)

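    # Ask only about fields that were not annotated before; the quote
    # character question is skipped when the file contains no quotes.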
    if previous:
        ask_delim = "delimiter" not in previous
        ask_quotechar = "quotechar" not in previous and has_quotechar(data)
        ask_escapechar = "escapechar" not in previous
    else:
        ask_delim = True
        ask_quotechar = has_quotechar(data)
        ask_escapechar = True

    print("Annotating file: %s" % filename)
    res, note = ask_dqe(
        filename,
        data,
        encoding,
        ask_delim,
        ask_quotechar,
        ask_escapechar,
        previous,
        less_pane,
    )

    out = DetectorResult(detector="human",
                         filename=filename,
                         runtime=None,
                         status=Status.OK)
    if note:
        out.note = note

    if res is None:
        less_pane.send_keys("q")
        less_pane.clear()
        out.status = Status.SKIP
        out.status_msg = StatusMsg.HUMAN_SKIP
        return out

    if res["delimiter"] is None:
        res["delimiter"] = ""
    if res["quotechar"] is None:
        res["quotechar"] = ""

    out.dialect = Dialect.from_dict(res)

    return out
Code Example #9
def load_detector_results(result_file):
    """
    Load the results from a given detector result file. Verify each record in 
    the process.
    """
    detector_names = set()
    results = {}
    with open(result_file, "r") as fid:
        for idx, line in enumerate(fid):
            try:
                record = DetectorResult.from_json(line.strip())
            except json.JSONDecodeError:
                print(
                    "\nError parsing the following record in file (line %i): "
                    "%s\n---\n%s" % (idx + 1, result_file, line.strip()))
                raise SystemExit(1)

            detector_names.add(record.detector)

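            # Normalise to absolute paths so the duplicate check does not
            # depend on how the path was written in the result file.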
            fname = record.filename
            if not os.path.isabs(fname):
                fname = os.path.abspath(fname)
                record.filename = fname
            if fname in results:
                raise ValueError(
                    "Duplicate result for file %s in detector file %s" %
                    (record.filename, result_file))

            record.validate()
            results[fname] = record

    if len(detector_names) > 1:
        raise ValueError("More than one detector name in file: %s" %
                         result_file)
    detector = detector_names.pop()
    return detector, results
Code Example #10
def wrap_determine_dqr(filename, verbose=False):
    res = run_with_timeout((filename, ), {"verbose": verbose}, TIMEOUT)
    if res is None:
        return DetectorResult(status=Status.FAIL, status_msg=StatusMsg.TIMEOUT)
    return res
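The run_with_timeout helper is not shown in these excerpts. Below is a minimal sketch of how such a helper could be implemented with multiprocessing, assuming it forwards its arguments to determine_dqr and that TIMEOUT is a number of seconds; both are assumptions, not the project's actual code.

import multiprocessing

TIMEOUT = 120  # assumed limit in seconds; the real value is not shown


def _target(queue, args, kwargs):
    # Hypothetical worker: run the detector in a child process and ship
    # the result back through the queue.
    queue.put(determine_dqr(*args, **kwargs))


def run_with_timeout(args, kwargs, limit):
    queue = multiprocessing.Queue()
    proc = multiprocessing.Process(target=_target, args=(queue, args, kwargs))
    proc.start()
    proc.join(limit)
    if proc.is_alive():
        # Still running after the deadline: kill the worker and report a
        # timeout by returning None, which the wrapper maps to TIMEOUT.
        proc.terminate()
        proc.join()
        return None
    if queue.empty():
        # The worker died without producing a result.
        return None
    return queue.get()

Running the detector in a separate process, rather than a thread, is what makes the hard kill on timeout possible; it does require that DetectorResult be picklable.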