Example #1
0
def get_headers(fn):
    """Return the parsed first line of the file at ``fn``.

    The first line of the file is read and passed to ``Csvline.parse``;
    whatever that call produces is returned unchanged.
    """
    parser = Csvline()
    with open(fn, "r") as source:
        header_line = source.readline()
    return parser.parse(header_line)
Example #2
0
              "dwc:genus",
              "dwc:verbatimLocality",
              "dwc:catalogNumber",
              "dwc:eventDate",
              "dwc:recordedBy"]
    #fields = ["dwc:occurrenceID"]
    #fields = ["dwc:waterBody"]
    #fields = headers


    out_dir = "out{0}_{1}".format(raw, recordset)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    sc = SparkContext(appName="UniqueCSVline")
    csvline = Csvline()

    # filter removes header line which is going to be unique
    records = sc.textFile(fn)
    first_line = records.take(1)[0]
    records = records.filter(lambda line: line != first_line)
    parsed = records.map(lambda x: csvline.parse(x.encode("utf8"), headers) )
    parsed.cache()

    # Most field names contain ":" and some in the raw data are URLs, so
    # collapse non-word characters to make each name usable as a file name.
    p = re.compile('[\W_]+')
    for field in fields:

        out_fn = "{0}/unique_{1}.csv".format(out_dir, p.sub("_", field))
        if os.path.exists(out_fn):