def getItems(config):
    configMeta = config["metadata"]
    inputFile = configMeta["src"]
    fieldnames, items = io.readCsv(inputFile, parseNumbers=False)
    if "query" in configMeta:
        items = lu.filterByQueryString(items, configMeta["query"])
        print("%s items after filtering" % len(items))

    # Add columns
    sets, items = addColumnsToItems(items, config)

    # Sort so that index corresponds to ID
    items = sorted(items, key=lambda item: item["id"])
    items = lu.addIndices(items)

    return (sets, items)
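# lu.addIndices is a project helper (lib.list_utils); a minimal sketch of the
# assumed behavior, inferred from how code elsewhere in these scripts reads
# item["index"] back: it stamps each item with its position after sorting.
def addIndices(items, keyName="index"):
    for i, item in enumerate(items):
        item[keyName] = i
    return items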
help="File generated by html_to_csv.py") parser.add_argument('-field', dest="FIELD", default="Locale", help="Field to output") parser.add_argument( '-out', dest="OUTPUT_FILE", default="data/processed/MexicoAndCentralAmerica_locales.csv", help="Output csv file") a = parser.parse_args() # Make sure output dirs exist io.makeDirectories([a.OUTPUT_FILE]) fieldNames, items = io.readCsv(a.INPUT_FILE) itemCount = len(items) values = [item[a.FIELD] for item in items] counter = collections.Counter(values) counts = counter.most_common() rows = [] for value, count in counts: if len(str(value).strip()) < 1: continue row = {} row[a.FIELD] = value row["Count"] = count rows.append(row)
from pprint import pprint
import random
import sys

import lib.io_utils as io
import lib.math_utils as mu

# input
parser = argparse.ArgumentParser()
parser.add_argument('-in', dest="INPUT_FILE", default="data/report_data.csv", help="File for input")
parser.add_argument('-start', dest="START_YEAR", default=1869, type=int, help="Start year")
parser.add_argument('-end', dest="END_YEAR", default=2018, type=int, help="End year")
parser.add_argument('-out', dest="OUTPUT_FILE", default="data/collections.json", help="File for output")
a = parser.parse_args()

_, data = io.readCsv(a.INPUT_FILE)
data = sorted(data, key=lambda k: k["year"])
latest = data[-1]

divisions = ["invertebrate zoology", "paleontology", "vertibrate zoology", "anthropology", "physical sciences"]

# entry = mu.roundInt(latest["invertebrate zoology"]/1000.0)
# drawDots("img/dots_circle_invertebrate.png", totalDots, dotW, highlightCount=entry)
# runningTotal = entry
#
# entry = mu.roundInt(latest["paleontology"]/1000.0)
# drawDots("img/dots_circle_paleontology.png", totalDots, dotW, highlightCount=entry, highlightColor=[217,64,107], highlightOffset=runningTotal)
# runningTotal += entry
#
# entry = mu.roundInt(latest["vertibrate zoology"]/1000.0)
# drawDots("img/dots_circle_vertibrate.png", totalDots, dotW, highlightCount=entry, highlightColor=[226,169,17], highlightOffset=runningTotal)
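# mu.roundInt is assumed to be a small math_utils helper that rounds a float
# to the nearest whole number and returns an int (used in the commented-out
# block above to turn raw counts into thousands of dots); a sketch under that
# assumption:
def roundInt(value):
    return int(round(value))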
dest="FIELD_LIST", default="", help="Comma-separated list of fields to output; leave blank for everything" ) a = parser.parse_args() # Make sure output dirs exist io.makeDirectories([a.OUTPUT_FILE]) items = [] fieldNames = [] if "*" in a.INPUT_FILE: files = glob.glob(a.INPUT_FILE) for fn in files: fFieldNames, fItems = io.readCsv(fn, parseNumbers=False) # Infer region from filename for j, fitem in enumerate(fItems): fItems[j]["Region"] = re.sub(r'(?<!^)(?=[A-Z])', ' ', os.path.basename(fn).split(".")[0]) fieldNames += fFieldNames items += fItems fieldNames = lu.unique(fieldNames) else: fieldNames, items = io.readCsv(a.INPUT_FILE, parseNumbers=False) itemCount = len(items) # this is where the normalized data will go cleanedItems = [{
FIELDS_LISTS = a.FIELDS_LISTS.strip().split(",")
FIELDS_MERGE = io.parseQueryString(a.FIELDS_MERGE.strip())

# Make sure output dirs exist
io.makeDirectories([a.OUTPUT_FILE])
if len(a.DETAILED_OUTPUT_FILE) > 0:
    io.makeDirectories([a.DETAILED_OUTPUT_FILE])

items = []
fieldNames = []
if "*" in a.INPUT_FILE:
    files = glob.glob(a.INPUT_FILE)
    for fn in files:
        fFieldNames, fItems = io.readCsv(fn)
        fieldNames += fFieldNames
        items += fItems
    fieldNames = lu.unique(fieldNames)
else:
    fieldNames, items = io.readCsv(a.INPUT_FILE)

# make unique based on id (the dict comprehension keeps the last row seen for each Catalog No)
items = list({item['Catalog No']: item for item in items}.values())
itemCount = len(items)

# Parse lists
for i, item in enumerate(items):
    for field in FIELDS_LISTS:
help="Max number of subjects (includes 'other')") parser.add_argument('-out', dest="OUTPUT_FILE", default="data/photographic_images.json", help="File for output") a = parser.parse_args() YEAR_RANGE = [1600, 2020] gridW, gridH = tuple([int(t) for t in a.GRID_SIZE.split("x")]) # Make sure output dirs exist io.makeDirectories(a.OUTPUT_FILE) # retrieve data fieldNames, data = io.readCsv(a.INPUT_FILE) dataCount = len(data) _, subjectData = io.readCsv(a.SUBJECTS_FILE) grid = np.loadtxt(a.GRID_FILE, delimiter=",") imageFiles = glob.glob(a.IMAGE_FILES) imageFiles = sorted(imageFiles) fileCount = len(imageFiles) print("Loaded %s files" % fileCount) # process subject data subjectData = groupList(subjectData, "subject", sort=True) subjectCount = len(subjectData) mainSubjects = subjectData[:a. MAX_SUBJECTS] if subjectCount > a.MAX_SUBJECTS else subjectData subjectLabels = [s["subject"] for s in mainSubjects]
parser.add_argument('-out', dest="OUTPUT_FILE", default="data/metadata.json", help="File for output") a = parser.parse_args() tileW, tileH = tuple([int(t) for t in a.TILE_SIZE.split("x")]) gridAssignment = None with open(a.INPUT_GRID_FILE, "rb") as f: gridAssignment = pickle.load(f) grid, gridShape = gridAssignment gridW, gridH = gridShape # Make sure output dirs exist io.makeDirectories(a.OUTPUT_FILE) # retrieve data fieldNames, data = io.readCsv(a.INPUT_FILE) dataCount = len(data) # retrieve images imageFiles = glob.glob(a.IMAGE_FILES) imageFiles = sorted(imageFiles) fileCount = len(imageFiles) print("Loaded %s files" % fileCount) if fileCount != len(grid): print("File count (%s) != grid count (%s)" % (fileCount, len(grid))) sys.exit() # initialize grid rows = [] for i in range(gridH):
import sys

import lib.io_utils as io
import lib.list_utils as lu
import lib.math_utils as mu

# input
parser = argparse.ArgumentParser()
parser.add_argument('-in', dest="INPUT_FILE", default="data/MexicoAndCentralAmerica.csv", help="File generated by html_to_csv.py")
parser.add_argument('-out', dest="OUTPUT_FILE", default="data/processed/MexicoAndCentralAmerica_cleaned.csv", help="Output csv file")
a = parser.parse_args()

# Make sure output dirs exist
io.makeDirectories([a.OUTPUT_FILE])

fieldNames, items = io.readCsv(a.INPUT_FILE)
itemCount = len(items)

# this is where the clean data will go
cleanedItems = [{
    "Id": item["Catalog No"],
    "Acquisition Year": "",
    "Acquisition Type": "",
    "Latitude": 0,
    "Longitude": 0,
    "Country": "",
    "Locale": "",
    "Category": "",
    "Hall": ""
} for item in items]
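# io.makeDirectories is assumed to ensure that the parent directories of the
# given output path(s) exist before anything is written; these scripts call it
# both with a single path and with a list of paths, so a sketch under that
# assumption:
import os

def makeDirectories(filenames):
    if not isinstance(filenames, list):
        filenames = [filenames]
    for filename in filenames:
        dirname = os.path.dirname(filename)
        if len(dirname) > 0:
            os.makedirs(dirname, exist_ok=True)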
def getItems(config):
    inputFile = config["metadataFile"]
    idCol = config["identifierColumn"] if "identifierColumn" in config else None
    fieldnames, items = io.readCsv(inputFile, parseNumbers=False)
    if "metadataFilterQuery" in config:
        items = lu.filterByQueryString(items, config["metadataFilterQuery"])
        print("%s items after filtering" % len(items))

    # map year, lat/lon, and category
    columnMap = [("dateColumn", "year"), ("latitudeColumn", "lat"), ("longitudeColumn", "lon"), ("countryColumn", "country"), ("groupByColumn", "category")]
    minimumYear = config["minimumYear"] if "minimumYear" in config else None
    maximumYear = config["maximumYear"] if "maximumYear" in config else None
    validItems = []
    for i, item in enumerate(items):
        validatedItem = item.copy()
        isValid = True
        for configKey, toColumn in columnMap:
            if configKey not in config:
                continue
            value = item[config[configKey]]
            if toColumn == "year":
                value = su.validateYear(value, minimumYear, maximumYear)
            elif toColumn == "lat":
                value = su.validateLat(value)
            elif toColumn == "lon":
                value = su.validateLon(value)
            if value is None:
                isValid = False
                break
            validatedItem[toColumn] = value
        if isValid:
            validItems.append(validatedItem)
    diff = len(items) - len(validItems)
    print(f'Found {diff} invalid items.')

    # Sort so that index corresponds to ID
    if idCol is not None:
        for i, item in enumerate(validItems):
            validItems[i]["_id"] = str(item[idCol])
        validItems = sorted(validItems, key=lambda item: item["_id"])
    validItems = lu.addIndices(validItems)
    if idCol is None:
        for i, item in enumerate(validItems):
            validItems[i]["_id"] = str(i)

    # Retrieve categories
    categories = []
    itemsByCategory = lu.groupList(validItems, "category", sort=True, desc=True)
    if "groupLimit" in config and len(itemsByCategory) > config["groupLimit"]:
        limit = config["groupLimit"] - 1
        otherItems = itemsByCategory[limit:]
        otherLabel = config["otherLabel"] if "otherLabel" in config else "Other"
        otherCount = 0
        for group in otherItems:
            for item in group["items"]:
                validItems[item["index"]]["category"] = otherLabel
                otherCount += 1
        itemsByCategory = itemsByCategory[:limit] + [{"category": otherLabel, "count": otherCount}]
    categoryColors = config["groupColors"]
    colorCount = len(categoryColors)
    for i, category in enumerate(itemsByCategory):
        color = categoryColors[i % colorCount]
        categories.append({
            "text": category["category"],
            "color": color,
            "count": category["count"]
        })
    return (validItems, categories)
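# The su.validate* helpers are assumed to return a cleaned value, or None when
# the raw CSV string is unusable, which is what the isValid check above relies
# on. A minimal sketch under that assumption (the real string_utils versions
# may accept more formats):
def validateYear(value, minimumYear=None, maximumYear=None):
    try:
        year = int(str(value).strip()[:4])
    except ValueError:
        return None
    if minimumYear is not None and year < minimumYear:
        return None
    if maximumYear is not None and year > maximumYear:
        return None
    return year

def validateLat(value):
    try:
        lat = float(value)
    except (TypeError, ValueError):
        return None
    return lat if -90.0 <= lat <= 90.0 else None

def validateLon(value):
    try:
        lon = float(value)
    except (TypeError, ValueError):
        return None
    return lon if -180.0 <= lon <= 180.0 else None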
dest="OUTPUT_FILE", default="data/processed/MexicoAndCentralAmerica_geocoded.csv", help="Output csv file") parser.add_argument('-wait', dest="WAIT_SECONDS", default=5, type=int, help="Seconds to wait before each request") a = parser.parse_args() LOCALE_FIELD = "Locale" # Make sure output dirs exist io.makeDirectories([a.OUTPUT_FILE, a.CACHE_FILE]) fieldNames, items = io.readCsv(a.INPUT_FILE) itemCount = len(items) for i, item in enumerate(items): items[i]["LookupString"] = "" # only lookup items that have country and locale with values if len(item["Country"]) < 1 or len(item[LOCALE_FIELD]) < 1: continue items[i]["LookupString"] = item[LOCALE_FIELD] + ", " + item["Country"] locales = [] localeLookup = {} if os.path.isfile(a.CACHE_FILE): _, locales = io.readCsv(a.CACHE_FILE)