Example #1
def getItems(config):
    configMeta = config["metadata"]
    inputFile = configMeta["src"]

    fieldnames, items = io.readCsv(inputFile, parseNumbers=False)
    if "query" in configMeta:
        items = lu.filterByQueryString(items, configMeta["query"])
        print("%s items after filtering" % len(items))

    # Add columns
    sets, items = addColumnsToItems(items, config)

    # Sort so that index corresponds to ID
    items = sorted(items, key=lambda item: item["id"])
    items = lu.addIndices(items)

    return (sets, items)
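
Note: lib.list_utils is not shown in these excerpts. A minimal sketch of what lu.addIndices is assumed to do (store each item's list position on the item itself, so the index lines up with the sorted order) might be:

def addIndices(items, keyName="index"):
    # Assumed behavior: record each item's position in the (already sorted) list
    for i, item in enumerate(items):
        item[keyName] = i
    return items

Because the items are sorted by "id" first, the stored index then corresponds to that ordering.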
Example #2
                    help="File generated by html_to_csv.py")
parser.add_argument('-field',
                    dest="FIELD",
                    default="Locale",
                    help="Field to output")
parser.add_argument(
    '-out',
    dest="OUTPUT_FILE",
    default="data/processed/MexicoAndCentralAmerica_locales.csv",
    help="Output csv file")
a = parser.parse_args()

# Make sure output dirs exist
io.makeDirectories([a.OUTPUT_FILE])

fieldNames, items = io.readCsv(a.INPUT_FILE)
itemCount = len(items)

values = [item[a.FIELD] for item in items]
counter = collections.Counter(values)
counts = counter.most_common()

rows = []
for value, count in counts:
    if len(str(value).strip()) < 1:
        continue
    row = {}
    row[a.FIELD] = value
    row["Count"] = count
    rows.append(row)
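
Note: the write step is outside this excerpt. The counting itself can be reproduced with only the standard library; a standalone sketch (file path and field name are placeholders, not the original helpers):

import collections
import csv

def countFieldValues(csvPath, field):
    # Tally non-empty values of one column, most frequent first
    with open(csvPath, newline="", encoding="utf-8") as f:
        values = [row[field] for row in csv.DictReader(f)]
    counter = collections.Counter(v for v in values if str(v).strip())
    return [{field: value, "Count": count} for value, count in counter.most_common()]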
Example #3
import argparse
from pprint import pprint
import random
import sys

import lib.io_utils as io
import lib.math_utils as mu

# input
parser = argparse.ArgumentParser()
parser.add_argument('-in', dest="INPUT_FILE", default="data/report_data.csv", help="File for input")
parser.add_argument('-start', dest="START_YEAR", default=1869, type=int, help="Start year")
parser.add_argument('-end', dest="END_YEAR", default=2018, type=int, help="End year")
parser.add_argument('-out', dest="OUTPUT_FILE", default="data/collections.json", help="File for output")
a = parser.parse_args()

_, data = io.readCsv(a.INPUT_FILE)
data = sorted(data, key=lambda k: k["year"])
latest = data[-1]

divisions = ["invertebrate zoology", "paleontology", "vertibrate zoology", "anthropology", "physical sciences"]

# entry = mu.roundInt(latest["invertebrate zoology"]/1000.0)
# drawDots("img/dots_circle_invertebrate.png", totalDots, dotW, highlightCount=entry)
# runningTotal = entry
#
# entry = mu.roundInt(latest["paleontology"]/1000.0)
# drawDots("img/dots_circle_paleontology.png", totalDots, dotW, highlightCount=entry, highlightColor=[217,64,107], highlightOffset=runningTotal)
# runningTotal += entry
#
# entry = mu.roundInt(latest["vertibrate zoology"]/1000.0)
# drawDots("img/dots_circle_vertibrate.png", totalDots, dotW, highlightCount=entry, highlightColor=[226,169,17], highlightOffset=runningTotal)
    dest="FIELD_LIST",
    default="",
    help="Comma-separated list of fields to output; leave blank for everything"
)
a = parser.parse_args()

# Make sure output dirs exist
io.makeDirectories([a.OUTPUT_FILE])

items = []
fieldNames = []

if "*" in a.INPUT_FILE:
    files = glob.glob(a.INPUT_FILE)
    for fn in files:
        fFieldNames, fItems = io.readCsv(fn, parseNumbers=False)
        # Infer region from filename, e.g. "MexicoAndCentralAmerica" -> "Mexico And Central America"
        region = re.sub(r'(?<!^)(?=[A-Z])', ' ', os.path.basename(fn).split(".")[0])
        for fitem in fItems:
            fitem["Region"] = region
        fieldNames += fFieldNames
        items += fItems
    fieldNames = lu.unique(fieldNames)

else:
    fieldNames, items = io.readCsv(a.INPUT_FILE, parseNumbers=False)

itemCount = len(items)

# this is where the normalized data will go
cleanedItems = [{
Example #5
FIELDS_LISTS = a.FIELDS_LISTS.strip().split(",")
FIELDS_MERGE = io.parseQueryString(a.FIELDS_MERGE.strip())

# Make sure output dirs exist
io.makeDirectories([a.OUTPUT_FILE])

if len(a.DETAILED_OUTPUT_FILE) > 0:
    io.makeDirectories([a.DETAILED_OUTPUT_FILE])

items = []
fieldNames = []

if "*" in a.INPUT_FILE:
    files = glob.glob(a.INPUT_FILE)
    for fn in files:
        fFieldNames, fItems = io.readCsv(fn)
        fieldNames += fFieldNames
        items += fItems
    fieldNames = lu.unique(fieldNames)

else:
    fieldNames, items = io.readCsv(a.INPUT_FILE)

# make unique based on id
items = list({item['Catalog No']: item for item in items}.values())

itemCount = len(items)

# Parse lists
for i, item in enumerate(items):
    for field in FIELDS_LISTS:
Example #6
                    help="Max number of subjects (includes 'other')")
parser.add_argument('-out',
                    dest="OUTPUT_FILE",
                    default="data/photographic_images.json",
                    help="File for output")
a = parser.parse_args()

YEAR_RANGE = [1600, 2020]

gridW, gridH = tuple([int(t) for t in a.GRID_SIZE.split("x")])

# Make sure output dirs exist
io.makeDirectories(a.OUTPUT_FILE)

# retrieve data
fieldNames, data = io.readCsv(a.INPUT_FILE)
dataCount = len(data)
_, subjectData = io.readCsv(a.SUBJECTS_FILE)
grid = np.loadtxt(a.GRID_FILE, delimiter=",")
imageFiles = glob.glob(a.IMAGE_FILES)
imageFiles = sorted(imageFiles)
fileCount = len(imageFiles)
print("Loaded %s files" % fileCount)

# process subject data
subjectData = groupList(subjectData, "subject", sort=True)
subjectCount = len(subjectData)
mainSubjects = subjectData[:a.MAX_SUBJECTS] if subjectCount > a.MAX_SUBJECTS else subjectData
subjectLabels = [s["subject"] for s in mainSubjects]
Example #7
parser.add_argument('-out', dest="OUTPUT_FILE", default="data/metadata.json", help="File for output")
a = parser.parse_args()

tileW, tileH = tuple([int(t) for t in a.TILE_SIZE.split("x")])

gridAssignment = None
with open(a.INPUT_GRID_FILE, "rb") as f:
    gridAssignment = pickle.load(f)
grid, gridShape = gridAssignment
gridW, gridH = gridShape

# Make sure output dirs exist
io.makeDirectories(a.OUTPUT_FILE)

# retrieve data
fieldNames, data = io.readCsv(a.INPUT_FILE)
dataCount = len(data)

# retrieve images
imageFiles = glob.glob(a.IMAGE_FILES)
imageFiles = sorted(imageFiles)
fileCount = len(imageFiles)
print("Loaded %s files" % fileCount)

if fileCount != len(grid):
    print("File count (%s) != grid count (%s)" % (fileCount, len(grid)))
    sys.exit()

# initialize grid
rows = []
for i in range(gridH):
Example #8
import argparse
import sys

import lib.io_utils as io
import lib.list_utils as lu
import lib.math_utils as mu

# input
parser = argparse.ArgumentParser()
parser.add_argument('-in', dest="INPUT_FILE", default="data/MexicoAndCentralAmerica.csv", help="File generated by html_to_csv.py")
parser.add_argument('-out', dest="OUTPUT_FILE", default="data/processed/MexicoAndCentralAmerica_cleaned.csv", help="Output csv file")
a = parser.parse_args()

# Make sure output dirs exist
io.makeDirectories([a.OUTPUT_FILE])

fieldNames, items = io.readCsv(a.INPUT_FILE)
itemCount = len(items)

# this is where the clean data will go
cleanedItems = [{
    "Id": item["Catalog No"],
    "Acquisition Year": "",
    "Acquisition Type": "",
    "Latitude": 0,
    "Longitude": 0,
    "Country": "",
    "Locale": "",
    "Category": "",
    "Hall": ""
} for item in items]
Example #9
def getItems(config):
    inputFile = config["metadataFile"]
    idCol = config["identifierColumn"] if "identifierColumn" in config else None

    fieldnames, items = io.readCsv(inputFile, parseNumbers=False)
    if "metadataFilterQuery" in config:
        items = lu.filterByQueryString(items, config["metadataFilterQuery"])
        print("%s items after filtering" % len(items))

    # map year, lat/lon, and category
    columnMap = [("dateColumn", "year"), ("latitudeColumn", "lat"),
                 ("longitudeColumn", "lon"), ("countryColumn", "country"),
                 ("groupByColumn", "category")]
    minimumYear = config["minimumYear"] if "minimumYear" in config else None
    maximumYear = config["maximumYear"] if "maximumYear" in config else None
    validItems = []
    for i, item in enumerate(items):
        validatedItem = item.copy()
        isValid = True
        for configKey, toColumn in columnMap:
            if configKey not in config:
                continue

            value = item[config[configKey]]
            if toColumn == "year":
                value = su.validateYear(value, minimumYear, maximumYear)
            elif toColumn == "lat":
                value = su.validateLat(value)
            elif toColumn == "lon":
                value = su.validateLon(value)
            if value is None:
                isValid = False
                break

            validatedItem[toColumn] = value

        if isValid:
            validItems.append(validatedItem)

    diff = len(items) - len(validItems)
    print(f'Found {diff} invalid items.')

    # Sort so that index corresponds to ID
    if idCol is not None:
        for i, item in enumerate(validItems):
            validItems[i]["_id"] = str(item[idCol])
        validItems = sorted(validItems, key=lambda item: item["_id"])

    validItems = lu.addIndices(validItems)
    if idCol is None:
        for i, item in enumerate(validItems):
            validItems[i]["_id"] = str(i)

    # Retrieve categories
    categories = []
    itemsByCategory = lu.groupList(validItems,
                                   "category",
                                   sort=True,
                                   desc=True)
    if "groupLimit" in config and len(itemsByCategory) > config["groupLimit"]:
        limit = config["groupLimit"] - 1
        otherItems = itemsByCategory[limit:]
        otherLabel = config["otherLabel"] if "otherLabel" in config else "Other"
        otherCount = 0
        for group in otherItems:
            for item in group["items"]:
                validItems[item["index"]]["category"] = otherLabel
                otherCount += 1
        itemsByCategory = itemsByCategory[:limit] + [{
            "category": otherLabel,
            "count": otherCount
        }]
    categoryColors = config["groupColors"]
    colorCount = len(categoryColors)
    for i, category in enumerate(itemsByCategory):
        color = categoryColors[i % colorCount]
        categories.append({
            "text": category["category"],
            "color": color,
            "count": category["count"]
        })

    return (validItems, categories)
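
Note: the su.validate* helpers are also outside the excerpt. The contract assumed by the loop above is that they return a normalized value, or None when the input cannot be parsed or is out of range. A sketch of the latitude/longitude checks under that assumption:

def validateLat(value):
    # Return a float latitude, or None if it cannot be parsed or is out of range
    try:
        lat = float(value)
    except (TypeError, ValueError):
        return None
    return lat if -90.0 <= lat <= 90.0 else None

def validateLon(value):
    # Same contract for longitude
    try:
        lon = float(value)
    except (TypeError, ValueError):
        return None
    return lon if -180.0 <= lon <= 180.0 else None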
    dest="OUTPUT_FILE",
    default="data/processed/MexicoAndCentralAmerica_geocoded.csv",
    help="Output csv file")
parser.add_argument('-wait',
                    dest="WAIT_SECONDS",
                    default=5,
                    type=int,
                    help="Seconds to wait before each request")
a = parser.parse_args()

LOCALE_FIELD = "Locale"

# Make sure output dirs exist
io.makeDirectories([a.OUTPUT_FILE, a.CACHE_FILE])

fieldNames, items = io.readCsv(a.INPUT_FILE)
itemCount = len(items)

for i, item in enumerate(items):
    items[i]["LookupString"] = ""

    # only lookup items that have country and locale with values
    if len(item["Country"]) < 1 or len(item[LOCALE_FIELD]) < 1:
        continue

    items[i]["LookupString"] = item[LOCALE_FIELD] + ", " + item["Country"]

locales = []
localeLookup = {}
if os.path.isfile(a.CACHE_FILE):
    _, locales = io.readCsv(a.CACHE_FILE)
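
Note: the listing ends before the geocoding loop, but the -wait argument implies a rate-limited request loop keyed by LookupString. A hypothetical sketch of that pattern (geocode() is a placeholder callable, not part of the original code):

import time

def geocodeAll(items, geocode, waitSeconds=5):
    # Look up each distinct non-empty LookupString, pausing before each remote request
    results = {}
    for item in items:
        query = item.get("LookupString", "")
        if not query or query in results:
            continue
        time.sleep(waitSeconds)
        results[query] = geocode(query)
    return results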