def get_dialects(data, encoding):
    """
    Enumerate candidate dialects using the fixed WRANGLER_DELIMS delimiters.

    Every delimiter in WRANGLER_DELIMS is combined with every quote character
    returned by get_potential_quotechars(data). For each such pair, the escape
    character options are the empty string (no escaping) plus every character
    that occurs directly before that delimiter or that quote character in
    ``data`` and for which is_potential_escapechar() is true.

    Parameters
    ----------
    data
        The content to inspect; iterated element by element via pairwise()
        (presumably the decoded text of the file -- confirm with callers).
    encoding
        Passed through unchanged to is_potential_escapechar().

    Returns
    -------
    list
        One ``Dialect(delim, quotechar, escapechar)`` per combination. The
        relative order of the escape characters within one
        (delimiter, quotechar) pair follows set iteration order.
    """
    delims = WRANGLER_DELIMS
    quotechars = get_potential_quotechars(data)

    # Scan the data a single time instead of once per delimiter and once more
    # per (delimiter, quotechar) pair: for every candidate character remember
    # which potential escape characters were seen directly in front of it.
    candidates = set(delims) | set(quotechars)
    preceding = {}
    for u, v in pairwise(data):
        if v in candidates and is_potential_escapechar(u, encoding):
            preceding.setdefault(v, set()).add(u)

    dialects = []
    for delim in delims:
        for quotechar in quotechars:
            escapes = set(preceding.get(delim, ()))
            escapes.update(preceding.get(quotechar, ()))
            # "No escape character" is always an option.
            escapes.add("")
            for escapechar in escapes:
                dialects.append(Dialect(delim, quotechar, escapechar))
    return dialects
def maybe_has_escapechar(data, encoding, delim, quotechar):
    """
    Tell whether ``data`` could be using an escape character.

    Returns True as soon as an occurrence of ``delim`` or ``quotechar`` is
    found that is directly preceded by a character accepted by
    is_potential_escapechar(); returns False otherwise.
    """
    if delim not in data and quotechar not in data:
        # Neither special character occurs, so nothing could be escaped.
        return False

    specials = [delim, quotechar]
    return any(
        following in specials and is_potential_escapechar(previous, encoding)
        for previous, following in pairwise(data)
    )
def get_escapechar_options(data, encoding, delim, quotechar):
    """
    Collect the escape character candidates for one delimiter/quotechar pair.

    A character qualifies when is_potential_escapechar() accepts it, it is
    immediately followed by ``delim`` or ``quotechar``, and it is not itself
    equal to ``delim`` or ``quotechar``.

    Returns the (possibly empty) set of qualifying characters.
    """
    specials = [delim, quotechar]
    return {
        previous
        for previous, following in pairwise(data)
        if is_potential_escapechar(previous, encoding)
        and following in specials
        and previous not in specials
    }
def break_ties_four(data, dialects):
    """
    Try to break a tie between several dialects that share one delimiter.

    Neighbouring dialects (as paired up by pairwise()) that parse ``data`` to
    the same result are reduced with break_ties_two(). If two or three
    dialects remain afterwards, the decision is deferred to break_ties_two()
    or break_ties_three() respectively.

    Parameters
    ----------
    data
        The content handed to parse_file().
    dialects
        The tied dialects; must be iterable more than once.

    Returns
    -------
    The result of break_ties_two() / break_ties_three() on the reduced
    dialects, or None when the dialects do not all share a single delimiter
    or the reduction does not leave exactly two or three dialects.
    """
    # NOTE: We have only observed one case during development where this
    # function was needed. It may need to be revisited in the future if other
    # examples are found.
    if len({d.delimiter for d in dialects}) != 1:
        return None

    # First, identify dialects that result in the same parsing result.
    equal_dialects = []
    for a, b in pairwise(dialects):
        if parse_file(data, a) == parse_file(data, b):
            equal_dialects.append((a, b))

    # Try to break the ties in these pairs. Survivors are kept in an
    # order-preserving, de-duplicated list rather than a set, so the order in
    # which they are passed on below does not depend on hash ordering.
    survivors = []
    visited = set()
    for A, B in equal_dialects:
        ans = break_ties_two(data, A, B)
        if ans is not None and ans not in survivors:
            survivors.append(ans)
        visited.add(A)
        visited.add(B)

    # Dialects that were not part of any tied pair are carried over as-is.
    for d in dialects:
        if d not in visited and d not in survivors:
            survivors.append(d)

    # Defer to other functions if the number of dialects was reduced
    if len(survivors) == 2:
        return break_ties_two(data, *survivors)
    if len(survivors) == 3:
        return break_ties_three(data, *survivors)
    return None
def get_potential_dialects(data, encoding):
    """
    Build the list of dialects that could plausibly describe ``data``.

    Delimiters come from get_potential_delimiters() and quote characters from
    get_potential_quotechars(). We consider as escape characters for a
    (delimiter, quotechar) pair the empty string plus those characters for
    which is_potential_escapechar() is True and that occur at least once
    directly before that delimiter or quote character. Combinations for which
    masked_by_quotechar() is true are left out.

    One may wonder whether self-escaping is an issue here (i.e. "\\\\", two
    backslashes in a row). It is not. Writing a literal backslash by escaping
    it with a backslash only makes sense in a file where the backslash is
    already used as an escape character, in which case it is included anyway.
    If it is never used to escape the delimiter or the quote character, then
    self-escaping is not necessary.

    Returns a list of Dialect(delim, quotechar, escapechar) objects.
    """
    delims = get_potential_delimiters(data, encoding)
    quotechars = get_potential_quotechars(data)

    # Every combination starts out with "no escape character" as an option.
    escapechars = {
        pair: set([""]) for pair in itertools.product(delims, quotechars)
    }

    for u, v in pairwise(data):
        if is_potential_escapechar(u, encoding):
            # u is a candidate for every pair whose delimiter or quote
            # character it directly precedes.
            for pair in itertools.product(delims, quotechars):
                if v in pair:
                    escapechars[pair].add(u)

    dialects = []
    for pair in itertools.product(delims, quotechars):
        delim, quotechar = pair
        for escapechar in escapechars[pair]:
            if not masked_by_quotechar(data, quotechar, escapechar, delim):
                dialects.append(Dialect(delim, quotechar, escapechar))
    return dialects
def break_ties_two(data, A, B):
    """
    Break ties between dialects A and B.

    Three situations are handled, depending on the one attribute in which
    the dialects differ:

    * Same delimiter and escapechar, one dialect without quotechar: both are
      parsed with parse_file(); if the results are equal the quotechar has
      no effect and the dialect without it wins, otherwise the one with it.
    * Same quotechar and escapechar, different delimiter: a comma wins over
      a space, and any delimiter wins over a dash.
    * Same delimiter and quotechar, different escapechar: the dialect with
      the escapechar wins only if, in every cell that differs between the
      two parses, the escapechar directly precedes the quotechar a positive,
      even number of times; otherwise the other dialect wins.

    Returns A or B, or None when the tie cannot be broken.
    """
    if A.delimiter == B.delimiter and A.escapechar == B.escapechar:
        if A.quotechar == "" or B.quotechar == "":
            d_no, d_yes = (A, B) if A.quotechar == "" else (B, A)
            X = parse_file(data, dialect=d_no)
            Y = parse_file(data, dialect=d_yes)
            if X == Y:
                # quotechar has no effect
                return d_no
            # quotechar has an effect
            return d_yes
    elif A.quotechar == B.quotechar and A.escapechar == B.escapechar:
        if sorted([A.delimiter, B.delimiter]) == sorted([",", " "]):
            # Artifact due to type detection (comma as radix point)
            return A if A.delimiter == "," else B
        elif A.delimiter == "-" or B.delimiter == "-":
            # Artifact due to type detection (dash as minus sign)
            return B if A.delimiter == "-" else A
    elif A.delimiter == B.delimiter and A.quotechar == B.quotechar:
        Dnone, Descape = (A, B) if A.escapechar == "" else (B, A)
        X = parse_file(data, Dnone)
        Y = parse_file(data, Descape)

        # Double check shape. Usually if the shape differs the pattern score
        # should have caught it, but if by a freakish occurrence it hasn't
        # then we can't break this tie (for now).
        if len(X) != len(Y):
            return None
        if any(len(x) != len(y) for x, y in zip(X, Y)):
            return None

        # Cells (as parsed without the escapechar) that come out differently
        # once the escapechar is taken into account.
        cells_unescaped = []
        for x, y in zip(X, Y):
            for u, v in zip(x, y):
                if u != v:
                    cells_unescaped.append(u)
        if not cells_unescaped:
            return None

        # We will break the ties in the following way:
        #
        # If the escapechar precedes the quotechar an even number of times
        # within each offending cell, then we think it is a functional escape
        # and the escaped version is the correct dialect. Note that if an odd
        # number of escaped quotechars would occur, then the shape of the file
        # will be different if it is ignored. Only if it occurs an even number
        # of times within the cell can we get the same shape.
        for u in cells_unescaped:
            count = sum(
                1
                for a, b in pairwise(u)
                if a == Descape.escapechar and b == Descape.quotechar
            )
            if count == 0 or count % 2 != 0:
                # One offending cell is enough to reject the escapechar.
                return Dnone
        return Descape
    return None