Exemple #1
0
def apply_map(map_path, data_path, out_file):
    """Apply a JSON mapping to data, and write the output.

    Only the first row of the data file (the CSV header) is read. Each
    header field is run through the mapping; the result is a dict keyed by
    input field name whose values are the ``as_json()`` form of the matched
    item, or of an empty ``mapper.MapItem(field, None)`` when the field has
    no mapping. That dict is dumped as JSON to ``out_file``. Progress and
    summary messages are printed to stdout.

    Args:
      map_path (str): Path to mapping file
      data_path (str): Path to data file
      out_file (file): output stream
    Return:
      None
    """
    # Both files stay open for as long as the mapping is in use, since this
    # function cannot tell whether mapper.Mapping reads its file eagerly.
    # NOTE: mode 'rU' is kept for Python 2 behaviour; it was removed in
    # Python 3.11, so use plain 'r' (with newline='') when porting.
    with open(map_path, 'r') as map_file, \
            open(data_path, 'rU') as data_file:
        mapping = mapper.Mapping(map_file, encoding='latin_1')
        data_csv = csv.reader(data_file)
        # map each field
        d = {}
        input_fields = next(data_csv)
        matched, nomatch = mapping.apply(input_fields)
        for field, m in matched.items():
            d[field] = m.as_json()
            print('Mapped {} => {}'.format(field, m.field))
        for field in nomatch:
            print('* No mapping found for input field: {}'.format(field))
            d[field] = mapper.MapItem(field, None).as_json()
    # write mapping as a JSON
    try:
        json.dump(d, out_file, ensure_ascii=True)
    except Exception as err:
        # Best effort: keep going so the stats are still printed, but report
        # the failure instead of hiding it. Catching Exception (rather than
        # BaseException) lets KeyboardInterrupt / SystemExit propagate.
        print('** Error: While writing mapping: {}'.format(err))
    # write stats
    print('Mapped {} fields: {} OK and {} did not match'.format(
        len(input_fields), len(matched), len(nomatch)))
Exemple #2
0
def find_duplicates(map_path, data_path, out_file):
    """Find duplicates created by a given mapping on a given input file.

    Only the first row of the data file (the CSV header) is read. Every
    header column that has a mapping is grouped by its destination field
    (``value.field``); whenever two or more source columns map to the same
    destination, one line is written to ``out_file`` in the form
    ``(<count>) <destination>: <src1> | <src2> | ...``.

    Args:
      map_path (str): Path to mapping file
      data_path (str): Path to data file
      out_file (file): output stream
    Return:
      None
    """
    # Both files stay open for as long as the mapping is in use, since this
    # function cannot tell whether mapper.Mapping reads its file eagerly.
    # NOTE: mode 'rU' is kept for Python 2 behaviour; it was removed in
    # Python 3.11, so use plain 'r' (with newline='') when porting.
    with open(map_path, "r") as map_file, \
            open(data_path, "rU") as data_file:
        mapping = mapper.Mapping(map_file, encoding='latin-1')
        hdr = next(csv.reader(data_file))
        # seen_values: destination field -> first source column mapped to it
        # dup: destination field -> all source columns mapped to it (>= 2)
        seen_values, dup = {}, {}
        for src in hdr:
            value = mapping.get(src, None)
            if value is None:
                continue
            dst = value.field
            if dst not in seen_values:
                seen_values[dst] = src
            elif dst in dup:
                # Already have >= 1 duplicate for this destination. The
                # lookup must use dst (the key of dup), not src; otherwise
                # the list is overwritten and earlier duplicates are lost.
                dup[dst].append(src)
            else:
                # First duplicate: record the original column and this one.
                dup[dst] = [seen_values[dst], src]
    # print results
    for value, keys in dup.items():
        keylist = ' | '.join(keys)
        out_file.write(
            "({n:d}) {v}: {kl}\n".format(
                n=len(keys),
                v=value,
                kl=keylist,
            ),
        )