def remove_duplicate_photo_uris_per_file():
    """
    If a file has the same photo URI multiple times, make a new photo entry
    with a union of the tags for each one, and the earlier commitdate.

    TODO: support media duplicates
    """
    max_index = int(get_max_entity_count())
    for file_path in [PANDA_PATH, ZOO_PATH]:
        section = None
        for section_name in ["zoos", "pandas"]:
            if section_name in file_path.split("/"):
                section = section_name.split("s")[0]   # HACK
        # Enter the pandas subdirectories
        for root, dirs, files in os.walk(file_path):
            for filename in files:
                path = root + os.sep + filename
                # print(path)
                photo_list = PhotoFile(section, path)
                photo_count = photo_list.photo_count()
                photo_index = 1
                seen = {}
                duplicates = {}
                while photo_index <= photo_count:
                    current_option = "photo." + str(photo_index)
                    current_uri = photo_list.get_field(current_option)
                    if current_uri is None:
                        # Gap in the photo indices; skip it
                        photo_index += 1
                        continue
                    current_author = photo_list.get_field(current_option + ".author")
                    current_date = photo_list.get_field(current_option + ".commitdate")
                    current_date_value = datetime_to_unixtime(current_date)
                    current_link = photo_list.get_field(current_option + ".link")
                    current_tags = photo_list.get_field(current_option + ".tags")
                    if current_uri in seen:
                        # We have a duplicate
                        seen_date_value = datetime_to_unixtime(
                            seen[current_uri]["commitdate"])
                        seen_tags = seen[current_uri]["tags"]
                        # Resolve dates: keep the earlier commitdate
                        if current_date_value < seen_date_value:
                            seen[current_uri]["commitdate"] = current_date
                        # Resolve tags, handling when either duplicate has none
                        if seen_tags is None and current_tags is not None:
                            seen[current_uri]["tags"] = current_tags
                        if seen_tags is not None and current_tags is not None:
                            tag_list = current_tags.split(", ") + seen_tags.split(", ")
                            tag_list = sorted(dict.fromkeys(tag_list))   # deduplicate tags
                            seen[current_uri]["tags"] = ", ".join(tag_list)
                        # Add to the duplicates list in its current form
                        duplicates[current_uri] = seen[current_uri]
                        # Remove both copies from the photo list
                        photo_list.delete_photo(photo_index)
                        photo_list.delete_photo(seen[current_uri]["old_index"])
                    elif current_uri in duplicates:
                        # We have something duplicated more than once
                        seen_date_value = datetime_to_unixtime(
                            duplicates[current_uri]["commitdate"])
                        seen_tags = duplicates[current_uri]["tags"]
                        # Resolve dates: keep the earlier commitdate
                        if current_date_value < seen_date_value:
                            duplicates[current_uri]["commitdate"] = current_date
                        # Resolve tags, handling when either duplicate has none
                        if seen_tags is None and current_tags is not None:
                            duplicates[current_uri]["tags"] = current_tags
                        if seen_tags is not None and current_tags is not None:
                            tag_list = current_tags.split(", ") + seen_tags.split(", ")
                            tag_list = sorted(dict.fromkeys(tag_list))   # deduplicate tags
                            duplicates[current_uri]["tags"] = ", ".join(tag_list)
                        # Remove this extra copy from the photo list
                        photo_list.delete_photo(photo_index)
                    else:
                        seen[current_uri] = {
                            "old_index": photo_index,
                            "author": current_author,
                            "commitdate": current_date,
                            "link": current_link,
                            "tags": current_tags,
                        }
                    photo_index += 1
                for photo_uri in duplicates.keys():
                    # Add merged duplicates back to the photo file,
                    # starting at the next unused index
                    photo_option = "photo." + str(photo_index)
                    photo_list.set_field(photo_option, photo_uri)
                    photo_list.set_field(photo_option + ".author",
                                         duplicates[photo_uri]["author"])
                    photo_list.set_field(photo_option + ".commitdate",
                                         duplicates[photo_uri]["commitdate"])
                    photo_list.set_field(photo_option + ".link",
                                         duplicates[photo_uri]["link"])
                    if duplicates[photo_uri]["tags"] is not None:
                        photo_list.set_field(photo_option + ".tags",
                                             duplicates[photo_uri]["tags"])
                    photo_index += 1
                # Update the file if there were any changes, and re-sort the hashes
                duplicate_count = len(duplicates)
                if duplicate_count > 0:
                    print("deduplicated: %s (%s duplicated)" % (path, duplicate_count))
                    photo_list.renumber_photos(max_index)
                    photo_list.update_file()
                    sort_ig_hashes(path)
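
# A standalone sketch of the tag-union rule the deduplicator applies when two
# entries share a photo URI: keep every tag from both, deduplicated and
# sorted. This helper is hypothetical (the real merge happens inline above)
# and exists only to illustrate the intended behavior.
def _sketch_merge_tag_strings(a, b):
    """e.g. _sketch_merge_tag_strings("smile, profile", "profile, wave")
    returns "profile, smile, wave"."""
    if a is None:
        return b
    if b is None:
        return a
    return ", ".join(sorted(dict.fromkeys(a.split(", ") + b.split(", "))))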

def update_entity_commit_dates(starting_commit, force=False):
    """
    When moving pandas, the old redpandafinder updater logic considered "new"
    animals to be anything that was a new file in a location. So when an
    animal moved zoos, it became _new_ again. Rectify this by tracking the
    commit date at which each animal first appeared. Track commit dates for
    other files too, just for the hell of it.
    """
    filename_to_commit_date = {}
    type_id_to_commit_date = {}
    repo = git.Repo(".")
    # List of sha1-name commits from the repo, oldest to newest
    commit_list = list(reversed([x.hexsha for x in repo.iter_commits()]))
    if starting_commit is not None:
        try:
            index = commit_list.index(starting_commit)
        except ValueError:
            raise CommitError("%s not a valid commit in this repo."
                              % starting_commit)
        # All commits after, and including, the given commit
        commit_list = commit_list[index:]
    for index, commitish in enumerate(commit_list):
        # End of the commit list? Call it a day
        if index == len(commit_list) - 1:
            break
        # Get the diff between this commit and the next
        start = commitish
        end = commit_list[index + 1]
        diff_raw = repo.git.diff(start, end,
                                 ignore_blank_lines=True,
                                 ignore_space_at_eol=True)
        patch = PatchSet(diff_raw)
        for change in patch:
            filename = change.path
            if filename.find(".txt") == -1:
                # Don't care about non-data files
                continue
            if not change.is_added_file:
                continue
            compare = "./" + filename
            dt = repo.commit(end).committed_datetime
            date = str(dt.year) + "/" + str(dt.month) + "/" + str(dt.day)
            just_file = filename.split("/").pop()
            just_type = None
            just_id = None
            if compare.find(PANDA_PATH) == 0:
                just_type = "panda"
                just_id = just_file.split("_")[0]
            elif compare.find(ZOO_PATH) == 0:
                just_type = "zoo"
                just_id = just_file.split("_")[0]
            elif compare.find(MEDIA_PATH) == 0:
                just_type = "media"
                just_id = filename   # Need full path for media files
            else:
                continue   # Not a file we're tracking commitdates for
            filename_to_commit_date[just_file] = date
            type_id_to_commit_date[just_type + "_" + just_id] = date
    # print(str(filename_to_commit_date))
    # print(str(type_id_to_commit_date))
    # Now walk the repo, find all entity files without commit dates,
    # and add a commitdate to each file that needs one
    for file_path in [MEDIA_PATH, PANDA_PATH, ZOO_PATH]:
        section = None
        for section_name in ["media", "zoos", "pandas"]:
            if section_name in file_path.split("/"):
                section = section_name.split("s")[0]   # HACK
        # Enter the pandas subdirectories
        for root, dirs, files in os.walk(file_path):
            for filename in files:
                path = root + os.sep + filename
                photo_list = PhotoFile(section, path)
                if photo_list.get_field("commitdate") is None:
                    if filename in filename_to_commit_date:
                        date = filename_to_commit_date[filename]
                    else:
                        # The file's name was changed at some point, so look
                        # it up by its type and ID instead
                        just_file = filename.split("/").pop()
                        just_type = None
                        just_id = None
                        if path.find(PANDA_PATH) == 0:
                            just_type = "panda"
                            just_id = just_file.split("_")[0]
                        elif path.find(ZOO_PATH) == 0:
                            just_type = "zoo"
                            just_id = just_file.split("_")[0]
                        elif path.find(MEDIA_PATH) == 0:
                            just_type = "media"
                            just_id = path   # Need full path for media files
                        else:
                            continue   # Not a file we're tracking commitdates for
                        just_key = just_type + "_" + just_id
                        if just_key not in type_id_to_commit_date:
                            print("warning: %s commitdate undetermined" % filename)
                            continue
                        date = type_id_to_commit_date[just_key]
                    old_date = photo_list.get_field("commitdate")
                    if (old_date is None) or force:
                        photo_list.set_field("commitdate", date)
                photo_list.update_file()
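
# A minimal sketch of the date formatting used above: commitdates are stored
# as "YYYY/M/D" strings (no zero padding) derived from a commit's
# committed_datetime. Assumes GitPython is available; the helper itself is
# hypothetical and shown only for illustration.
def _sketch_commitdate_string(repo_path="."):
    repo = git.Repo(repo_path)
    dt = repo.head.commit.committed_datetime
    return str(dt.year) + "/" + str(dt.month) + "/" + str(dt.day)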

def update_photo_commit_dates(starting_commit, force=False):
    """
    The old redpandafinder update logic only worked on the basis of commits
    in the last week or so. When files are re-sorted, added, or removed for
    periods of time, it becomes meaningful to search the entire git repo,
    find when a photo URI first appeared, and track it in redpandafinder by
    that first commit date.
    """
    uri_to_commit_date = {}
    repo = git.Repo(".")
    # List of sha1-name commits from the repo, oldest to newest
    commit_list = list(reversed([x.hexsha for x in repo.iter_commits()]))
    if starting_commit is not None:
        try:
            index = commit_list.index(starting_commit)
        except ValueError:
            raise CommitError("%s not a valid commit in this repo."
                              % starting_commit)
        # All commits after, and including, the given commit
        commit_list = commit_list[index:]
    for index, commitish in enumerate(commit_list):
        # End of the commit list? Call it a day
        if index == len(commit_list) - 1:
            break
        # Get the diff between this commit and the next
        start = commitish
        end = commit_list[index + 1]
        diff_raw = repo.git.diff(start, end,
                                 ignore_blank_lines=True,
                                 ignore_space_at_eol=True)
        patch = PatchSet(diff_raw)
        for change in patch:
            filename = change.path
            if filename.find(".txt") == -1:
                # Don't care about non-data files
                continue
            if change.added <= 0:
                # No lines were added, so we don't care
                continue
            for hunk in change:
                for line in hunk:
                    if not line.is_added:
                        continue
                    if re.match(r"photo\.\d+:", line.value) is None:
                        # Not a photo line
                        continue
                    if len(line.value.strip().split(": ")) != 2:
                        # Missing delimiter (seen in old commits) or bad linebreaks
                        continue
                    [key, value] = line.value.strip().split(": ")
                    if value in uri_to_commit_date:
                        # Only record a commit date once per photo URI
                        continue
                    if (value.find("http") != 0) and (value.find("ig://") != 0):
                        # Not a URI, so not a photo reference
                        continue
                    dt = repo.commit(end).committed_datetime
                    date = str(dt.year) + "/" + str(dt.month) + "/" + str(dt.day)
                    uri_to_commit_date[value] = date
    # print(str(uri_to_commit_date))
    # Now walk the repo, find all files with photo lines that have no commit
    # dates, and add a commitdate to each photo that needs one
    for file_path in [PANDA_PATH, ZOO_PATH, MEDIA_PATH]:
        section = None
        for section_name in ["media", "zoos", "pandas"]:
            if section_name in file_path.split("/"):
                section = section_name.split("s")[0]   # HACK
        # Enter the pandas subdirectories
        for root, dirs, files in os.walk(file_path):
            for filename in files:
                path = root + os.sep + filename
                # print(path)
                photo_list = PhotoFile(section, path)
                photo_count = photo_list.photo_count()
                photo_index = 1
                while photo_index <= photo_count:
                    photo_option = "photo." + str(photo_index)
                    photo_uri = photo_list.get_field(photo_option)
                    date_option = photo_option + ".commitdate"
                    if photo_uri not in uri_to_commit_date:
                        photo_index += 1
                        continue
                    date_value = uri_to_commit_date[photo_uri]
                    old_date_value = photo_list.get_field(date_option)
                    if (old_date_value is None) or force:
                        photo_list.set_field(date_option, date_value)
                        # print(photo_uri + " ==> " + date_value)
                    photo_index += 1
                photo_list.update_file()
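
# A standalone sketch (hypothetical helper, not part of the module's call
# graph) of the filter applied to each added diff line above: only
# "photo.N: <uri>" lines whose value looks like an http or ig:// URI get a
# commit date recorded.
def _sketch_is_photo_uri_line(line_value):
    if re.match(r"photo\.\d+:", line_value) is None:
        return False   # Not a photo line
    parts = line_value.strip().split(": ")
    if len(parts) != 2:
        return False   # Missing delimiter or bad linebreaks
    value = parts[1]
    return value.startswith("http") or value.startswith("ig://")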

def sort_ig_hashes(path):
    """
    Take a zoo/panda file, and sort all photos by their IG hashes. This makes
    the photos appear in the order they were uploaded to IG, oldest to newest.
    If a photo does not use an IG URI, keep its index unchanged.
    """
    # print(path)
    print("sorting: %s" % path)
    # IG alphabet for hashes, time ordering oldest to newest
    hash_order = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
    section = None
    for section_name in ["wild", "zoos", "media", "pandas"]:
        if section_name in path.split("/"):
            section = section_name.split("s")[0]   # HACK
    photo_list = PhotoFile(section, path)
    photo_count = photo_list.photo_count()
    max_index = int(get_max_entity_count()) + 1
    if photo_count >= max_index:
        max_index = photo_count + 1
    non_ig_indices = []
    ig_photos = []
    # Build photo indices of IG photos and non-IG photos
    start_index = 1
    stop_point = max_index
    photo_index = start_index
    while photo_index <= stop_point:
        photo_option = "photo." + str(photo_index)
        photo = photo_list.get_field(photo_option)
        if photo is None:
            # Missing photo at this index, continue
            photo_index += 1
            continue
        # Convert IG photo formats to use the new event handler
        photo = update_ig_link(photo)
        photo_list.set_field(photo_option, photo)
        # If our updated photo link has an ig:// uri, do the moving
        if "ig://" in photo:
            # Track the photo and index as a tuple
            ig_photos.append([photo, photo_index])
            # Rename all photo fields as "old.photo" fields
            photo_list.move_field("old." + photo_option, photo_option)
            photo_list.move_field("old." + photo_option + ".author",
                                  photo_option + ".author")
            photo_list.move_field("old." + photo_option + ".commitdate",
                                  photo_option + ".commitdate")
            photo_list.move_field("old." + photo_option + ".link",
                                  photo_option + ".link")
            photo_list.move_field("old." + photo_option + ".tags",
                                  photo_option + ".tags")
            if section == "media":
                panda_tags = photo_list.get_field("panda.tags").split(", ")
                for panda_id in panda_tags:
                    photo_item = photo_option + ".tags." + panda_id + ".location"
                    photo_list.move_field("old." + photo_item, photo_item)
        else:
            # Track the non-IG index so we can avoid it.
            # These photos don't need renaming.
            non_ig_indices.append(photo_index)
        photo_index += 1
    # Sort the list of IG photo tuples by photo URI (the 0th item in each
    # tuple). The IG hash is the third "/"-delimited component of the URI
    # (index 2, as in "ig://<hash>"). Shorter hashes are older, so the
    # stable length sort last yields a shortlex ordering, oldest to newest.
    ig_photos = sorted(
        ig_photos,
        key=lambda x: [hash_order.index(char) for char in x[0].split("/")[2]])
    ig_photos = sorted(ig_photos, key=lambda x: len(x[0].split("/")[2]))
    # Now, re-distribute the photos, iterating down the IG photos and moving
    # each "old.photo" field back to a "photo" field with an updated index
    list_index = start_index
    photo_index = start_index
    used_indices = []
    while photo_index <= stop_point:
        if list_index - 1 == len(ig_photos):
            # No more photos, for certain
            break
        [photo, old_index] = ig_photos[list_index - 1]
        photo_index = list_index
        while photo_index in non_ig_indices:
            photo_index += 1   # Avoid indices for non-IG photos
        while photo_index in used_indices:
            photo_index += 1   # Avoid indices we already used
        used_indices.append(photo_index)
        current_option = "photo." + str(photo_index)
        old_option = "old.photo." + str(old_index)
        photo_list.move_field(current_option, old_option)
        photo_list.move_field(current_option + ".author",
                              old_option + ".author")
        photo_list.move_field(current_option + ".commitdate",
                              old_option + ".commitdate")
        photo_list.move_field(current_option + ".link",
                              old_option + ".link")
        photo_list.move_field(current_option + ".tags",
                              old_option + ".tags")
        if section == "media":
            panda_tags = photo_list.get_field("panda.tags").split(", ")
            for panda_id in panda_tags:
                current_loc_tag = current_option + ".tags." + panda_id + ".location"
                old_loc_tag = old_option + ".tags." + panda_id + ".location"
                photo_list.move_field(current_loc_tag, old_loc_tag)
        list_index += 1
    # We're done. Update the photo file
    photo_list.update_file()
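
# A minimal sketch of the two-pass sort used in sort_ig_hashes. Because
# Python's sort is stable, sorting by alphabet order first and hash length
# second is equivalent to this single shortlex key: shorter IG hashes (older
# posts) come first, with ties broken by IG's base-64 alphabet. The helper
# and example URIs below are hypothetical.
def _sketch_sort_ig_uris(uris):
    hash_order = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
    def shortlex(uri):
        ig_hash = uri.split("/")[2]   # "ig://<hash>" -> index 2
        return (len(ig_hash), [hash_order.index(c) for c in ig_hash])
    return sorted(uris, key=shortlex)

# _sketch_sort_ig_uris(["ig://B9xYz1", "ig://Aq3zZ", "ig://B1abc"])
# -> ["ig://Aq3zZ", "ig://B1abc", "ig://B9xYz1"]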