def push_new_row_subjects(self, source_subject, target_subject_set_id, row_paths_by_column):
    """
    Push new, unclassified row subjects to the given subject set.

    Given image paths for the new column-indexed rows
    (row_paths_by_column), create one subject per row image, with
    metadata referencing the source subject and its column index.
    """
    project = Project.find(settings.PROJECT_ID)
    target_set = SubjectSet.find(target_subject_set_id)

    created_subjects = []
    for column_index, row_paths in row_paths_by_column.items():
        self._logger.info(
            'Creating %d new row subjects for column index %d for subject %s',
            len(row_paths), column_index, source_subject.id)
        for row_path in row_paths:
            row_subject = Subject()
            row_subject.links.project = project
            # Carry the book / page identifiers over from the source subject
            for field in ('book', 'page'):
                row_subject.metadata[field] = source_subject.metadata[field]
            row_subject.metadata['source_document_subject_id'] = source_subject.id
            row_subject.metadata['source_document_column_index'] = column_index
            row_subject.add_location(row_path)
            row_subject.save()
            created_subjects.append(row_subject)

    target_set.add(created_subjects)
def link_new_set(self, subject_set_id):
    """
    Link an existing subject set to a workflow on this instance's project.

    Fixes over the previous revision: the subject set is fetched with
    SubjectSet.find (it was fetched via Subject.find, the wrong resource
    type), the nonexistent ``links.sub()`` call is replaced with the real
    ``links.subject_sets.add()`` API, and leftover ``print(1)``/``print(2)``
    debug statements are removed.

    :param subject_set_id: ID of the subject set to link
    :return: None
    """
    subject_set = SubjectSet.find(subject_set_id)
    workflow = Workflow()
    workflow.links.project = self.project
    workflow.links.subject_sets.add(subject_set)
def run(self):
    """
    Migrate segmented railroad rows.

    For every segmented-row subject, resolve its target subject set and
    add the subject to that set in a batch per set.

    NOTE(review): removals_by_target_set records each subject's current
    set memberships but is never acted on — subjects are added to their
    new sets but apparently never removed from their old ones. TODO
    confirm whether removal was intended.
    """
    subjects_and_their_target_sets = self._calculate_target_subject_sets_by_subject()
    segmented_rows_and_their_target_sets = self \
        ._segmented_row_target_sets(subjects_and_their_target_sets)
    additions_by_target_set = defaultdict(list)
    removals_by_target_set = defaultdict(list)
    for subject_id, target_subject_set_id in segmented_rows_and_their_target_sets.items():
        # target_subject_set = self._get_subject_set(target_subject_set_id)
        self._logger.debug('Saving segmented row %d to set: %s', subject_id,
                           target_subject_set_id)
        subject = Subject.find(subject_id)
        additions_by_target_set[target_subject_set_id].append(subject)
        # Record every set the subject currently belongs to (unused; see note).
        for curr_subject_set in subject.links.subject_sets:
            removals_by_target_set[curr_subject_set.id].append(subject_id)
    # Add subjects to their appropriate target sets, one batch per set.
    for target_subject_set_id, new_subjects in additions_by_target_set.items():
        target_subject_set = self._get_subject_set(target_subject_set_id)
        target_subject_set.add(new_subjects)
def uploadSubjectToSet(project, subjectSet, locationsList, metadataList):
    """
    Upload one subject per (locations, metadata) pair to subjectSet.

    Each locations entry is a 3-item sequence: [galaxy image, model JSON,
    comparison JSON]; metadataList must be the same length as
    locationsList.
    """
    print('Uploading {} subjects to {}'.format(len(locationsList), subjectSet))
    # imagePath can be string or list, metadata must be same dimension
    if len(locationsList) != len(metadataList):
        print(
            '\t\033[31mInvalid arguments, locationsList and metadataList',
            'must have same length\033[0m'
        )
        return

    subjects = []
    for locations, meta in tqdm(zip(locationsList, metadataList)):
        subject = Subject()
        subject.links.project = project
        # the json subjects need to be added in a more manual way so we can
        # specify a MIME type
        # comparison between model and image
        addLocation(subject, {'application/json': locations[1]})
        # actual galaxy image
        subject.add_location(locations[0])
        # and now just the model
        addLocation(subject, {'application/json': locations[2]})
        for key, value in meta.items():
            subject.metadata[key] = value
        try:
            subject.save()
        except RuntimeError:
            pass
        subjects.append(subject)

    subjectSet.add(subjects)
    return subjectSet
def _hydrate_book_and_page(cls, row):
    """
    Copy the book/page fields from the SubjectModel onto the Panoptes
    subject's metadata and save it.

    :param row: mapping containing a 'subject_id' key
    :raises ValueError: if any hydrated book/page field is None
    """
    subject = Subject.find(row['subject_id'])
    subject_model = SubjectModel(subject)
    subject.metadata['book'] = subject_model['book']
    subject.metadata['page'] = subject_model['page']
    for field in cls.BOOK_AND_PAGE_FIELDS:
        if subject.metadata[field] is None:
            # BUG FIX: previously `% field` applied only the first format
            # argument and passed the rest as extra ValueError args,
            # producing a garbled message (and a TypeError for the '%d').
            raise ValueError(
                "WARN: None '%s' for subject %d and filepath %s"
                % (field, subject.id, subject.metadata['filepath']))
    subject.save()
def ls(subject_set_id, quiet, subject_ids):
    """
    Lists subject IDs and their media URLs.
    """
    if subject_ids:
        # Explicit subject IDs take precedence over the subject-set listing.
        for requested_id in subject_ids:
            found = Subject.find(requested_id)
            if quiet:
                click.echo(found.id)
            else:
                echo_subject(found)
        return

    matching = Subject.where(subject_set_id=subject_set_id)
    if quiet:
        click.echo(" ".join([subject.id for subject in matching]))
    else:
        for subject in matching:
            echo_subject(subject)
def upload_subjects(subject_set_id, manifest_file):
    """
    Upload subjects from a CSV manifest to the given subject set.

    Each row's columns become metadata; any column matching IMAGE_REGEX
    that resolves to an existing file (relative to the manifest) is
    attached as a media location. Subjects are linked to the set in
    batches of LINK_BATCH_SIZE.

    Fixes: Python 2 ``r.next()`` replaced with ``next(r)``, and the lazy
    ``map(subject.add_location, files)`` — a no-op under Python 3 —
    replaced with an explicit loop.
    """
    subject_set = SubjectSet.find(subject_set_id)
    subject_rows = []
    with open(manifest_file) as manifest_f:
        file_root = os.path.dirname(manifest_file)
        r = csv.reader(manifest_f)
        headers = next(r)
        for row in r:
            metadata = dict(zip(headers, row))
            files = []
            for col in row:
                file_match = re.match(IMAGE_REGEX, col)
                file_path = os.path.join(file_root, col)
                if file_match and os.path.exists(file_path):
                    files.append(file_path)
            if len(files) == 0:
                click.echo('Could not find any files in row:', err=True)
                click.echo(','.join(row), err=True)
                return -1
            subject_rows.append((files, metadata))

    created_subjects = []
    with click.progressbar(
        enumerate(subject_rows),
        length=len(subject_rows),
        label='Uploading subjects',
    ) as _subject_rows:
        for count, (files, metadata) in _subject_rows:
            subject = Subject()
            subject.links.project = subject_set.links.project
            for media_file in files:
                subject.add_location(media_file)
            subject.metadata.update(metadata)
            subject.save()
            created_subjects.append(subject)

            # Link in batches to limit request size.
            if (count + 1) % LINK_BATCH_SIZE == 0:
                subject_set.add(created_subjects)
                created_subjects = []

    if len(created_subjects) > 0:
        subject_set.add(created_subjects)
def save_subject(manifest_item, project, pbar=None):
    """
    Add manifest item to project.

    Note: follow with subject_set.add(subject) to associate with subject set.

    Args:
        manifest_item (dict): of form {png_loc: img.png, key_data: some_data_dict}
        project (str): project to upload subject too e.g. '5773' for Galaxy Zoo
        pbar (tqdm.tqdm): progress bar to update. If None, no bar will display.

    Returns:
        Subject: the saved subject

    Raises:
        FileNotFoundError: if the png_loc file does not exist
    """
    subject = Subject()
    subject.links.project = project
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not os.path.exists(manifest_item['png_loc']):
        raise FileNotFoundError(
            'Missing subject location: {}'.format(manifest_item['png_loc']))
    subject.add_location(manifest_item['png_loc'])
    subject.metadata.update(manifest_item['key_data'])
    subject.save()
    if pbar:
        pbar.update()
    return subject
def add_new_subject(self, image_list, metadata_list, subject_set_name):
    """
    Add subjects and their metadata to a newly created subject set.

    image_list and metadata_list must be of equal length.

    :param image_list: list of images to be added
    :param metadata_list: list of metadata to be added
    :param subject_set_name: display name for the new subject set
    :return: None
    """
    # Start by making sure we have two equal length lists. Previously a
    # mismatch only printed a warning and carried on, which could pair
    # images with the wrong metadata or raise IndexError; abort instead.
    if len(image_list) != len(metadata_list):
        print("Image list and metadata list do not match")
        return

    # Link to the subject set we want
    subject_set = SubjectSet()
    subject_set.links.project = self.project
    subject_set.display_name = subject_set_name
    subject_set.save()

    # Go through the image and metadata list and add the items
    new_subjects = []
    for image, metadata in zip(image_list, metadata_list):
        subject = Subject()
        subject.links.project = self.project
        subject.add_location(image)
        subject.metadata.update(metadata)
        subject.save()
        new_subjects.append(subject)
    subject_set.add(new_subjects)
def pushSubject(subjectSet, project, imageLocations, metadata, livePost):
    """
    Create, save, and link a subject when livePost is truthy.

    Retries the save indefinitely on ConnectionError. Returns the saved
    Subject, or None when livePost is falsy (dry run).
    """
    if not livePost:
        return None

    subject = Subject()
    subject.links.project = project
    for location in imageLocations:
        subject.add_location(location)
    subject.metadata.update(metadata)

    # Keep retrying until the save goes through.
    while True:
        try:
            subject.save()
            break
        except ConnectionError as err:
            print('{} , TRYING AGAIN'.format(err))

    subjectSet.add(subject)
    return subject
def make_tutorial_images(imagePaths, ellipseData, projectData):
    """
    Upload tutorial images (with per-image ellipse metadata) to the
    subject set configured in projectData.

    :param imagePaths: iterable of local image file paths
    :param ellipseData: pandas GroupBy keyed by image index; each group
        provides the ellipse rows for make_metadata
    :param projectData: dict with 'user_name', 'password' and
        'subject_set' keys
    """
    # Connect to Panoptes
    Panoptes.connect(
        username=projectData["user_name"], password=projectData["password"]
    )

    # Look up the subject set once, up front — previously this network
    # call was repeated on every loop iteration.
    try:
        subjectSet = SubjectSet.find(projectData["subject_set"])
    except PanoptesAPIException as e:
        print(e)
        return

    newSubjects = []
    for imageId, imagePath in enumerate(imagePaths):
        print(f"Adding {imagePath}...")
        newSubject = Subject()
        newSubject.add_location(imagePath)
        newSubject.links.project = subjectSet.links.project
        newSubject.metadata.update(
            make_metadata(
                ellipseData.get_group(imageId).reset_index(drop=True),
                imagePath
            )
        )
        newSubject.save()
        newSubjects.append(newSubject)

    subjectSet.add(newSubjects)
def create_subject(project, metadata, media_files):
    """
    Create and save a Subject linked to *project*, attaching each media
    file and the given metadata. Returns the saved Subject.
    """
    new_subject = Subject()
    new_subject.links.project = project
    for path in media_files:
        new_subject.add_location(path)
    new_subject.metadata.update(metadata)
    new_subject.save()
    return new_subject
def _create_subject(self, project_id, filename, metadata=None):
    """
    Create and save a Subject for *filename* under the given project.

    :param project_id: ID of the Panoptes project to link to
    :param filename: media file to attach
    :param metadata: optional metadata dict to merge into the subject
    :return: the saved Subject
    """
    new_subject = Subject()
    new_subject.links.project = Project.find(project_id)
    new_subject.add_location(filename)
    if metadata:
        new_subject.metadata.update(metadata)
    new_subject.save()
    return new_subject
def queue_new_subject_creation(cls, subject_id, vertex_centroids, target_subject_set_id):
    """
    Given subject ID and vertex centroids, fetch subject image and perform
    segmentation. Static-w/-instance-of-self pattern to support enqueuing
    in RQ.
    """
    log = setup_logger(cls.LOGGER_NAME, 'log/queue_operations.log')
    ops = QueueOperations(log)

    source_subject = Subject.find(subject_id)
    image_path = ops.fetch_subject_image_to_tmp(source_subject)

    # Split the page image into columns at the supplied centroids.
    column_paths = ops.perform_column_segmentation(
        subject_id, image_path, vertex_centroids
    )
    for path in column_paths:
        ops.upscale_small_images(path)

    # Split each column into rows, then push the rows up as new subjects.
    rows_by_column = ops.perform_row_segmentation(column_paths)
    ops.push_new_row_subjects(source_subject, target_subject_set_id, rows_by_column)
def create_subject(project, media_files, metadata):
    """
    Create a subject

    Args:
        - project: a Project() object defining the Zooniverse project
        - media_files: a list of media files to link to the subject
        - metadata: a dictionary with metadata to attach
    """
    new_subject = Subject()
    new_subject.links.project = project
    for media_path in media_files:
        new_subject.add_location(media_path)
    new_subject.metadata.update(metadata)
    new_subject.save()
    return new_subject
def upload_subject(locations: List, project: Project, subject_set_name: str, metadata: Dict):
    """
    Create a subject from the given media locations and metadata, then add
    it to the named subject set (created on demand).

    :param locations: local file paths to attach to the subject
    :param project: Panoptes project the subject belongs to
    :param subject_set_name: display name of the target subject set
    :param metadata: metadata dict to attach to the subject
    :return: ID of the saved subject
    :raises FileNotFoundError: if any location is not an existing file
    """
    subject = Subject()
    subject.links.project = project
    # add files, validating each exists before attaching it
    for location in locations:
        if not os.path.isfile(location):
            raise FileNotFoundError(
                'Missing subject location: {}'.format(location))
        subject.add_location(location)

    subject.metadata.update(metadata)
    # (removed a redundant `subject_set_name = subject_set_name` self-assignment)
    subject_set = get_or_create_subject_set(project.id, subject_set_name)
    subject.save()
    subject_set.add(subject)
    return subject.id
def upload_images(id, use_database=True):
    """
    Create a subject set named after *id* and upload the images listed in
    the local ``*-manifest.txt`` files into it, then link the set to
    workflow 11973.

    :param id: identifier for the source; also used as the subject set
        display name and as the working-directory suffix under ``target``
    :param use_database: when True, record progress via update_status()
    """
    print('Create subject set and upload images for', id)
    if use_database:
        update_status(id, gz_status='Uploading')
    # NOTE(review): the saved cwd is never used or restored — the process
    # stays chdir'd into `target + id` after this call. TODO confirm intent.
    wd = os.getcwd()
    # NOTE(review): username appears masked-out ('******') — presumably
    # redacted for publication; verify before running.
    Panoptes.connect(username='******',
                     password=os.environ['PANOPTES_PASSWORD'])
    os.chdir(target + id)
    project = Project.find(slug='chrismrp/radio-galaxy-zoo-lofar')
    subject_set = SubjectSet()
    subject_set.display_name = id
    subject_set.links.project = project
    subject_set.save()
    print('Made subject set')
    new_subjects = []
    g = glob.glob('*-manifest.txt')
    for i, f in enumerate(g):
        # Manifest is a single CSV line: id, 3 image locations, name, ra, dec, size
        bits = open(f).readlines()[0].split(',')
        metadata = {
            'subject_id': int(bits[0]),
            'ra': float(bits[5]),
            'dec': float(bits[6]),
            '#size': float(bits[7]),  # leading '#' hides the field from volunteers
            'source_name': bits[4]
        }
        print('Upload doing', bits[4], '%i/%i' % (i, len(g)))
        subject = Subject()
        subject.links.project = project
        subject.metadata.update(metadata)
        for location in bits[1:4]:
            subject.add_location(location)
        subject.save()
        new_subjects.append(subject)
    subject_set.add(new_subjects)
    # Attach the new set to the (hard-coded) classification workflow.
    workflow = Workflow(11973)
    workflow.links.subject_sets.add(subject_set)
    if use_database:
        update_status(id, gz_status='In progress')
    print('Done!')
def create_subjects_and_link_to_project(self, proto_subjects, project_id, workflow_id, subject_set_id):
    """
    Create subjects from proto_subjects and attach them, via a subject
    set, to the given project and workflow.

    If subject_set_id is None, a new subject set named after the current
    UTC timestamp is created; otherwise the existing set is used.

    :param proto_subjects: iterable of dicts with 'location_lc',
        'location_ps' and 'metadata' keys
    :param project_id: Panoptes project ID
    :param workflow_id: ID of the workflow to link the subject set to
    :param subject_set_id: existing subject set ID, or None to create one
    """
    try:
        USERNAME = os.getenv('PANOPTES_USERNAME')
        PASSWORD = os.getenv('PANOPTES_PASSWORD')
        Panoptes.connect(username=USERNAME, password=PASSWORD,
                         endpoint=self.ENDPOINT)

        project = Project.find(project_id)
        # Use the classmethod lookups directly instead of instantiating
        # throwaway objects (`Workflow().find(...)`).
        workflow = Workflow.find(workflow_id)
        if subject_set_id is None:  # was `== None`
            subject_set = SubjectSet()
            ts = time.gmtime()
            subject_set.display_name = time.strftime("%m-%d-%Y %H:%M:%S", ts)
            subject_set.links.project = project
            subject_set.save()
        else:
            subject_set = SubjectSet.find(subject_set_id)

        subjects = []
        for proto_subject in proto_subjects:
            subject = Subject()
            subject.links.project = project
            subject.add_location(proto_subject['location_lc'])
            subject.add_location(proto_subject['location_ps'])
            subject.metadata.update(proto_subject['metadata'])
            subject.save()
            subjects.append(subject)

        subject_set.add(subjects)
        workflow.add_subject_sets(subject_set)
    except Exception:
        # Boundary handler: log the full traceback rather than crash the caller.
        self.log.exception("Error in create_subjects_and_link_to_project ")
def run():
    """
    Query for completed subjects, calculate kmeans vertex centroids, fetch
    subject images, split columns by centroids, row segmentation with Ocropy.

    For each configured columns-workflow: aggregate vertex classifications
    into per-subject centroids, resolve each subject's target subject set,
    and collect retired subject IDs. Then enqueue an RQ segmentation job
    (via Redis) for every retired subject not already processed.
    """
    logger = setup_logger(settings.APP_NAME,
                          'log/kmeans_and_enqueue_completed_subjects.log',
                          logging.DEBUG)
    subject_set_csv = SubjectSetCSV()
    workflow_router = SubjectSetWorkflowRouter(subject_set_csv, settings, logger)
    pages_raw_subject_ids = subject_set_csv.raw_pages_subject_ids()
    logger.debug("Running Wires and Rails Workflow Processor")
    Panoptes.connect(username=settings.PANOPTES_USERNAME,
                     password=settings.PANOPTES_PASSWORD)
    retired_subject_ids = []
    vertices_and_target_subject_sets = []
    for _subject_set_id, metadata in settings.COLUMNS_WORKFLOW_METADATA.items():
        logger.debug("Loading vertices / subject retirement info for %(debug_name)s subject set " \
            "(subject set id: %(subject_set_id)d; workflow id: %(workflow_id)d; task id: " \
            " %(task_id)s", metadata)
        classification_kwargs = {
            'scope': 'project',
            'project_id': settings.PROJECT_ID,
            'workflow_id': metadata['workflow_id']
        }
        logger.debug("Loading classifications by params %s",
                     str(classification_kwargs))
        classifications_records = [
            c for c in Classification.where(**classification_kwargs)
        ]
        classifications = VertexClassifications(classifications_records,
                                                pages_raw_subject_ids)
        # Aggregate vertex centroids
        centroids_by_subject = classifications.vertex_centroids(
            metadata['task_id'])
        for subject_id, centroids in centroids_by_subject.items():
            # Find target subject set ID, or log and skip the subject
            try:
                target_subject_set_id = workflow_router \
                    .target_subject_set_id(subject_id, classifications_records)
            except UnidentifiedRawSubjectSetException as ex:
                logger.error(ex.args[0])
                continue
            except SharedMajorityException as ex:
                # TODO need add'l monitoring for this, e.g. manual report exception
                logger.error(ex.args[0])
                continue
            vertices_and_target_subject_sets.append(
                [subject_id, centroids, target_subject_set_id])
        # Aggregate retired subjects
        workflow = Workflow.find(metadata['workflow_id'])
        retirement_count = workflow.retirement['options']['count']
        retired_subject_ids += classifications.retired_subject_ids(
            metadata['task_id'], retirement_count)
    logger.debug(
        'Retrieved the following subject centroids for image segmentation: %s',
        str(vertices_and_target_subject_sets))
    logger.debug('For the following retired subject IDs: %s',
                 str(retired_subject_ids))
    queue = Queue(connection=Redis(host=settings.REDIS_HOST))
    for subject_id, centroids, target_subject_set_id in vertices_and_target_subject_sets:
        # Only retired subjects are segmented.
        if subject_id not in retired_subject_ids:
            continue
        subject = Subject.find(subject_id)
        # Skip subjects already flagged as processed by a previous run.
        if settings.METADATA_KEY_ALREADY_PROCESSED in subject.metadata and \
           subject.metadata[settings.METADATA_KEY_ALREADY_PROCESSED]:
            logger.debug('Skipping subject id %d; already processed.',
                         subject_id)
            continue
        logger.debug('Enqueuing subjects id: %d', subject_id)
        queue.enqueue(QueueOperations.queue_new_subject_creation, subject_id,
                      centroids, target_subject_set_id,
                      timeout=2 * 60 * 60)
        QueueOperations.flag_subject_as_queued(subject)
# NOTE(review): this excerpt starts inside a try block whose opening (the
# lookup of an existing subject set) is outside this view.
    print('\n', 'It may take a while to recover the names of files previously uploaded, to ensure no duplicates')
    # Remember filenames already in the set so they are not re-uploaded.
    for subject in subject_set.subjects:
        previous_subjects.append(subject.metadata['Filename'])
except StopIteration:
    # create a new subject set for the new data and link it to the project above
    subject_set = SubjectSet()
    subject_set.links.project = project
    subject_set.display_name = set_name
    subject_set.save()

print('Uploading subjects, this could take a while!')
new_subjects = 0
for filename, metadata in subject_metadata.items():
    try:
        if filename not in previous_subjects:
            subject = Subject()
            subject.links.project = project
            # compress() presumably resizes the image (max 960px) before
            # upload and returns the resulting path — TODO confirm.
            subject.add_location(compress(args.image_dir, filename, 960))
            subject.metadata.update(metadata)
            subject.save()
            subject_set.add(subject.id)
            new_subjects += 1
    except panoptes_client.panoptes.PanoptesAPIException:
        print('An error occurred during the upload of ', filename)
print(new_subjects, 'new subjects created and uploaded')
print('Uploading complete, Please wait while the full subject listing is prepared and saved in')
output_file = "uploaded_subjects.csv"
print('"%s" in the drive with the original images' % output_file)
#!/usr/bin/env python3 """ Un-flag arbitrary subjects as not processed, useful for debugging workflow processing. """ import sys sys.path.insert(0, "..") from panoptes_client import Panoptes, Subject from lib import settings Panoptes.connect(username=settings.PANOPTES_USERNAME, password=settings.PANOPTES_PASSWORD) # SUBJECT_IDS = ['5823821', '5823822'] # SUBJECT_IDS = ['14813279', '14813280', '14813281'] # SUBJECT_IDS = ['15327062','15327056','15327068','15327065'] # Telegraph tests - SUBJECT_IDS = ['15327068', '15327065', '15327062', '15327059', '15327056'] for subject_id in SUBJECT_IDS: subject = Subject.find(subject_id) subject.metadata[settings.METADATA_KEY_ALREADY_PROCESSED] = False subject.save()
def _create_subjects_from_epicollect5(self, project, subjects_metadata):
    """
    Build and save one Subject per Epicollect5 observation record.

    :param project: Panoptes project each subject is linked to
    :param subjects_metadata: iterable of Epicollect5 record dicts
    :return: list of the saved Subject objects
    """
    # Fields copied over verbatim from each record.
    copied_keys = ('id', 'project', 'obs_type', 'source', 'url',
                   'created_at', 'observer')
    subjects = list()
    for record in subjects_metadata:
        subject = Subject()
        for key in copied_keys:
            subject.metadata[key] = record[key]
        subject.metadata['longitude'] = record['location']['longitude']
        subject.metadata['latitude'] = record['location']['latitude']
        subject.metadata['comment'] = record['comment']
        subject.metadata['spectrum_type'] = record.get('spectrum_type', "?")
        subject.add_location({'image/jpg': record['url']})
        subject.links.project = project
        subject.save()
        subjects.append(subject)
    return subjects
# Python 2 script fragment: upload a station's spectrogram PNGs to a
# Zooniverse project. (This excerpt is cut off mid-statement at the end.)
if len(files) == 0:
    raise Exception(
        'Error finding PNG files. Did you specify correct station? (' +
        BASEDIR + 'ZOO/' + station + '/*.png)')
metadata = open(BASEDIR + station + '.zoo', 'r')
# One value per line: FFT size, overlap, and colour scale bounds.
# NOTE(review): readlines() keeps trailing newlines on these values — TODO confirm.
(fft, overlap, color_min, color_max) = metadata.readlines()
#Create uploaded directory if necessary
dest = BASEDIR + 'ZOO/' + station + '/uploaded/'
if not (os.path.isdir(dest)):
    os.mkdir(dest)
for file in files:
    print "Uploading file %s" % file
    sys.stdout.flush()
    subject = Subject()
    subject.links.project = project
    subject.add_location(file)
    # You can set whatever metadata you want, or none at all
    subject.metadata['filename'] = os.path.basename(file)
    #TODO subject.metadata['file_start'] =
    #TODO subject.metadata['sample_rate'] = 5512
    subject.metadata['fft'] = fft
    subject.metadata['overlap'] = overlap
    subject.metadata['color_min'] = color_min
    subject.metadata['color_max'] = color_max
    #TODO subject.metadata['width'] =
    #TODO subject.metadata['height'] =
    subject.save()
    subjects.append(subject)
    os.rename(file,  # (excerpt truncated mid-call here)
# Load Panoptes credentials/config and the list of subject IDs to update.
with open('config.yaml') as config_f:
    config = yaml.load(config_f, Loader=yaml.FullLoader)

with open(SUBJECT_ID_FILE) as subject_id_f:
    subject_ids = [s.strip() for s in subject_id_f.readlines()]

Panoptes.connect(**config)

with ChargingBar(
    'Updating',
    max=len(subject_ids),
    suffix='%(percent).1f%% %(eta_td)s'
) as bar:
    # Batch the metadata saves server-side.
    with Subject.async_saves():
        for subject_id in subject_ids:
            bar.next()
            subject = Subject.find(subject_id)
            # Skip subjects already carrying the CERiT marker.
            if '!CERiT' in subject.metadata:
                continue
            # SuperWASP ID is the prefix of the (case-varying) filename field.
            superwasp_id = subject.metadata.get(
                'Filename',
                subject.metadata.get('filename')).split('_')[0]
            # Coordinates are encoded in the ID after the '1SWASP' prefix.
            coords = superwasp_id.replace('1SWASP', '')
            coords_quoted = urllib.parse.quote(coords)
            ra = urllib.parse.quote('{}:{}:{}'.format(
                coords[1:3],
                coords[3:5],
                coords[5:10]
# (excerpt truncated mid-expression here)
def ls(subject_id):
    """Look up a single subject by ID and echo its details."""
    subject = Subject.find(subject_id)
    echo_subject(subject)
# Python 2 uploader: push a station's spectrogram PNGs to Zooniverse and
# move each uploaded file into an "uploaded" directory.
subjects = []
files = glob.glob(BASEDIR+'ZOO/'+station+'/*.png')
if len(files) == 0:
    raise Exception('Error finding PNG files. Did you specify correct station? ('+BASEDIR+'ZOO/'+station+'/*.png)')
metadata = open(BASEDIR+station+'.zoo','r')
# One value per line: FFT size, overlap, and colour scale bounds.
# NOTE(review): readlines() keeps trailing newlines on these values — TODO confirm.
(fft,overlap,color_min,color_max) = metadata.readlines()
#Create uploaded directory if necessary
dest = BASEDIR+'ZOO/'+station+'/uploaded/'
if not(os.path.isdir(dest)):
    os.mkdir(dest)
for file in files:
    print "Uploading file %s" % file
    sys.stdout.flush()
    subject = Subject()
    subject.links.project = project
    subject.add_location(file)
    # You can set whatever metadata you want, or none at all
    subject.metadata['filename'] = os.path.basename(file)
    #TODO subject.metadata['file_start'] =
    #TODO subject.metadata['sample_rate'] = 5512
    subject.metadata['fft'] = fft
    subject.metadata['overlap'] = overlap
    subject.metadata['color_min'] = color_min
    subject.metadata['color_max'] = color_max
    #TODO subject.metadata['width'] =
    #TODO subject.metadata['height'] =
    subject.save()
    subjects.append(subject)
    os.rename(file, dest+os.path.basename(file))  #move file to uploaded directory
# Upload loop for video subjects. (This excerpt is cut off inside the
# final exception handler.)
videos_uploaded = 0
for original_file in file_list:  # loop through the file list
    # test if the file is already uploaded, if so skip it
    if original_file not in previous_subjects:
        # get date-time from original video file
        try:
            video_data = FFProbe(location + os.sep + original_file)
            # NOTE(review): `datetime` shadows the stdlib module name — TODO confirm safe here.
            datetime = video_data.metadata['creation_time']
        except (IOError, KeyError, TypeError):
            print('Acquiring exif data for ', original_file, ' failed')
            datetime = ''
        # finally we are ready for the actual upload of the modified file:
        try:
            subject = Subject()
            subject.links.project = project
            # compress() writes the transcoded video to the fixed temp.mp4 path.
            compress(location + os.sep + original_file)
            print('Compressed ', original_file, 'to',
                  os.path.getsize('temp.mp4'), 'bytes, uploading....')
            subject.add_location('temp.mp4')
            videos_uploaded += 1
            # update the subject metadata (add '#' to the beginning of the field name to hide that field)
            subject.metadata['Site_Date'] = set_name
            subject.metadata['Filename'] = original_file
            subject.metadata['Date_time'] = datetime
            # nothing is actually uploaded to panoptes until the save is executed.
            # for testing without actually uploading anything comment out the following two lines
            subject.save()
            subject_set.add(subject.id)
        except panoptes_client.panoptes.PanoptesAPIException:
            # (excerpt truncated: handler body not visible)
# NOTE(review): this excerpt begins mid-function — the dict literal closed
# by the brace below, and the enclosing transform_item_segments definition,
# start outside this view.
        }
        segments.append(segment)
    print('Item segments transformation complete.')
    return segments


# Transform a Library of Congress item into uploadable segments.
segments = transform_item_segments(
    'https://www.loc.gov/item/' + LIBRARY_OF_CONGRESS_ITEM_ID)

Panoptes.connect(username=USERNAME, password=PASSWORD, endpoint=ENDPOINT)
project = Project.find(PROJECT)

subject_set = SubjectSet()
subject_set.links.project = project
subject_set.display_name = segments[0]['metadata']['Title']  # uses item Title as default subject set name, or feel free to hardcode
subject_set.save()

print('Begin Zooniverse subject upload...')
for segment in segments:
    subject = Subject()
    subject.links.project = project
    subject.add_location(segment['location'])
    subject.metadata.update(segment['metadata'])
    subject.save()
    subject_set.add(subject)
print("Zooniverse subject upload complete.")
# NOTE(review): this excerpt begins mid-call — the prompt text below is the
# tail of an input()/prompt call opened outside this view.
        'Enter "n" to cancel this upload, any other key to continue' + '\n')
if retry.lower() == 'n':
    quit()

# create a new subject set for the new data and link it to the project above
subject_set = SubjectSet()
subject_set.links.project = project
subject_set.display_name = set_name
subject_set.save()

print('Uploading subjects, This could take a while!')
new_subjects = 0
old_subjects = 0
for filename, metadata in subject_metadata.items():
    try:
        # Upload only files not seen in a previous run.
        if filename not in previous_subjects:
            subject = Subject()
            subject.links.project = project
            subject.add_location(location + os.sep + filename)
            subject.metadata.update(metadata)
            subject.save()
            subject_set.add(subject.id)
            print(filename)
            new_subjects += 1
        else:
            old_subjects += 1
    except panoptes_client.panoptes.PanoptesAPIException:
        print('An error occurred during the upload of ', filename)
print(new_subjects, 'new subjects created and uploaded',
      old_subjects, 'already uploaded')

uploaded = 0  # NOTE(review): purpose not visible within this excerpt
images = [a['src'] for a in soup.find_all("img", {"src": re.compile("gstatic.com")})] #print images for img in images: raw_img = urllib2.urlopen(img).read() #add the directory for your image here DIR="images/" cntr = len([i for i in os.listdir(DIR) if image_type in i]) + 1 f = open(DIR + image_type + "_"+ str(cntr)+".jpg", 'wb') f.write(raw_img) f.close() print 'Creating image set...' # create the subject set. subject_set = SubjectSet() subject_set.links.project = p subject_set.display_name = "Images of " + thing + '\'s' subject_set.save() print 'Uploading images to Zooniverse...' # add all images to subject set for i in range(1,21): subject = Subject() subject.links.project = p subject.add_location('images/' + str(thing) + '_' + str(i)+'.jpg') subject.save() subject_set.add(subject) print 'Complete.'
# Build per-file metadata: the filename plus its ordinal position in `files`.
subject_metadata = {}
for f, file in enumerate(files):
    subject_metadata[file] = {'file': file, 'subject_reference': f}

Panoptes.connect(username=username, password=password)

# Project-creation code kept commented out for reference; an existing
# project is reused instead.
# tutorial_project = Project()
tutorial_project = Project.find(7699)  # hard-coded project ID
# tutorial_project.display_name = display_name
# tutorial_project.description = description
# tutorial_project.primary_language = 'en'
# tutorial_project.private =True
# tutorial_project.save()

# Create a new subject set in the project.
subject_set = SubjectSet()
subject_set.links.project = tutorial_project
subject_set.display_name = subject_name
subject_set.save()
tutorial_project.reload()
print(tutorial_project.links.subject_sets)

# Create and save one subject per file, then link them all in one batch.
new_subjects = []
for filename, metadata in tqdm.tqdm(subject_metadata.items()):
    subject = Subject()
    subject.links.project = tutorial_project
    subject.add_location(filename)
    subject.metadata.update(metadata)
    subject.save()
    new_subjects.append(subject)
subject_set.add(new_subjects)
def upload_subjects(
    subject_set_id,
    manifest_files,
    allow_missing,
    remote_location,
    mime_type,
    file_column,
):
    """
    Uploads subjects from each of the given MANIFEST_FILES.

    Example with only local files:

    $ panoptes subject-set upload-subjects 4667 manifest.csv

    Local filenames will be automatically detected in the manifest and
    uploaded, or filename columns can be specified with --file-column.

    If you are hosting your media yourself, you can put the URLs in the
    manifest and specify the column number(s):

    $ panoptes subject-set upload-subjects -r 1 4667 manifest.csv

    $ panoptes subject-set upload-subjects -r 1 -r 2 4667 manifest.csv

    Any local files will still be detected and uploaded.

    Fix over the previous revision: when resuming from a YAML state file,
    entries were deleted from ``waiting_to_link`` while iterating its
    ``.items()`` view, which raises RuntimeError on Python 3; the loop now
    iterates over a snapshot.
    """
    if (
        len(manifest_files) > 1
        and any(map(lambda m: m.endswith('.yaml'), manifest_files))
    ):
        click.echo(
            'Error: YAML manifests must be processed one at a time.',
            err=True,
        )
        return -1
    elif manifest_files[0].endswith('.yaml'):
        # Resume a previously failed upload from its saved state.
        with open(manifest_files[0], 'r') as yaml_manifest:
            upload_state = yaml.load(yaml_manifest, Loader=yaml.FullLoader)
        if upload_state['state_version'] > CURRENT_STATE_VERSION:
            click.echo(
                'Error: {} was generated by a newer version of the Panoptes '
                'CLI and is not compatible with this version.'.format(
                    manifest_files[0],
                ),
                err=True,
            )
            return -1
        if upload_state['subject_set_id'] != subject_set_id:
            click.echo(
                'Warning: You specified subject set {} but this YAML '
                'manifest is for subject set {}.'.format(
                    subject_set_id,
                    upload_state['subject_set_id'],
                ),
                err=True,
            )
            click.confirm(
                'Upload {} to subject set {} ({})?'.format(
                    manifest_files[0],
                    subject_set_id,
                    SubjectSet.find(subject_set_id).display_name,
                ),
                abort=True,
            )
            upload_state['subject_set_id'] = subject_set_id
        resumed_upload = True
    else:
        # Fresh upload: initialise the resumable state from the CLI options.
        upload_state = {
            'state_version': CURRENT_STATE_VERSION,
            'subject_set_id': subject_set_id,
            'manifest_files': manifest_files,
            'allow_missing': allow_missing,
            'remote_location': remote_location,
            'mime_type': mime_type,
            'file_column': file_column,
            'waiting_to_upload': [],
            'waiting_to_link': {},
        }
        resumed_upload = False

    # A single MIME type may be given for multiple remote locations;
    # otherwise the counts must match exactly.
    remote_location_count = len(upload_state['remote_location'])
    mime_type_count = len(upload_state['mime_type'])
    if remote_location_count > 1 and mime_type_count == 1:
        upload_state['mime_type'] = (
            upload_state['mime_type'] * remote_location_count
        )
    elif remote_location_count > 0 and mime_type_count != remote_location_count:
        click.echo(
            'Error: The number of MIME types given must be either 1 or equal '
            'to the number of remote locations.',
            err=True,
        )
        return -1

    def validate_file(file_path):
        # Reject missing, empty, or oversized files before uploading anything.
        if not os.path.isfile(file_path):
            click.echo(
                'Error: File "{}" could not be found.'.format(
                    file_path,
                ),
                err=True,
            )
            return False

        file_size = os.path.getsize(file_path)
        if file_size == 0:
            click.echo(
                'Error: File "{}" is empty.'.format(
                    file_path,
                ),
                err=True,
            )
            return False
        elif file_size > MAX_UPLOAD_FILE_SIZE:
            click.echo(
                'Error: File "{}" is {}, larger than the maximum {}.'.format(
                    file_path,
                    humanize.naturalsize(file_size),
                    humanize.naturalsize(MAX_UPLOAD_FILE_SIZE),
                ),
                err=True,
            )
            return False
        return True

    subject_set = SubjectSet.find(upload_state['subject_set_id'])
    if not resumed_upload:
        subject_rows = []
        for manifest_file in upload_state['manifest_files']:
            with open(manifest_file, 'U') as manifest_f:
                file_root = os.path.dirname(manifest_file)
                r = csv.reader(manifest_f, skipinitialspace=True)
                headers = next(r)
                for row in r:
                    metadata = dict(zip(headers, row))
                    files = []
                    if not upload_state['file_column']:
                        # Auto-detect file columns from the first data row.
                        upload_state['file_column'] = []
                        for field_number, col in enumerate(row, start=1):
                            file_path = os.path.join(file_root, col)
                            if os.path.exists(file_path):
                                upload_state['file_column'].append(
                                    field_number,
                                )
                                if not validate_file(file_path):
                                    return -1
                                files.append(file_path)
                    else:
                        for field_number in upload_state['file_column']:
                            file_path = os.path.join(
                                file_root,
                                row[field_number - 1]
                            )
                            if not validate_file(file_path):
                                return -1
                            files.append(file_path)

                    # Remote (self-hosted) locations are passed as
                    # {mime_type: url} dicts.
                    for field_number, _mime_type in zip(
                        upload_state['remote_location'],
                        upload_state['mime_type'],
                    ):
                        files.append({_mime_type: row[field_number - 1]})

                    if len(files) == 0:
                        click.echo(
                            'Could not find any files in row:',
                            err=True,
                        )
                        click.echo(','.join(row), err=True)
                        if not upload_state['allow_missing']:
                            return -1
                        else:
                            continue
                    subject_rows.append((files, metadata))

            if not subject_rows:
                click.echo(
                    'File {} did not contain any rows.'.format(
                        manifest_file,
                    ),
                    err=True,
                )
                return -1

        subject_rows = list(enumerate(subject_rows))
        upload_state['waiting_to_upload'] = copy.deepcopy(subject_rows)
    else:
        # Re-queue any subjects that were created but no longer exist
        # server-side. BUG FIX: iterate over a snapshot, since entries are
        # deleted from the dict inside the loop.
        for subject_id, subject_row in list(
            upload_state['waiting_to_link'].items()
        ):
            try:
                subject = Subject.find(subject_id)
            except PanoptesAPIException:
                upload_state['waiting_to_upload'].append(subject_row)
                del upload_state['waiting_to_link'][subject_id]
        subject_rows = copy.deepcopy(upload_state['waiting_to_upload'])

    pending_subjects = []

    def move_created(limit):
        # Poll async saves until at most `limit` subjects remain pending.
        while len(pending_subjects) > limit:
            for subject, subject_row in pending_subjects:
                if subject.async_save_result:
                    pending_subjects.remove((subject, subject_row))
                    upload_state['waiting_to_upload'].remove(subject_row)
                    upload_state['waiting_to_link'][subject.id] = subject_row
            time.sleep(0.5)

    def link_subjects(limit):
        # Link created subjects to the subject set once a batch accumulates.
        if len(upload_state['waiting_to_link']) > limit:
            subject_set.add(list(upload_state['waiting_to_link'].keys()))
            upload_state['waiting_to_link'].clear()

    with click.progressbar(
        subject_rows,
        length=len(subject_rows),
        label='Uploading subjects',
    ) as _subject_rows:
        try:
            with Subject.async_saves():
                for subject_row in _subject_rows:
                    count, (files, metadata) = subject_row
                    subject = Subject()
                    subject.links.project = subject_set.links.project
                    for media_file in files:
                        subject.add_location(media_file)
                    subject.metadata.update(metadata)
                    subject.save()

                    pending_subjects.append((subject, subject_row))

                    move_created(MAX_PENDING_SUBJECTS)
                    link_subjects(LINK_BATCH_SIZE)

            move_created(0)
            link_subjects(0)
        finally:
            # On failure, offer to persist the upload state for resumption.
            if (
                len(pending_subjects) > 0
                or len(upload_state['waiting_to_link']) > 0
            ):
                click.echo('Error: Upload failed.', err=True)
                if click.confirm(
                    'Would you like to save the upload state to resume the '
                    'upload later?',
                    default=True,
                ):
                    while True:
                        state_file_name = 'panoptes-upload-{}.yaml'.format(
                            subject_set_id,
                        )
                        state_file_name = click.prompt(
                            'Enter filename to save to',
                            default=state_file_name,
                        )

                        if not state_file_name.endswith('.yaml'):
                            click.echo(
                                'Error: File name must end in ".yaml".',
                                err=True,
                            )
                            if click.confirm(
                                'Save to {}.yaml?'.format(state_file_name),
                                default=True,
                            ):
                                state_file_name += '.yaml'
                            else:
                                continue
                        if not is_valid_filename(state_file_name):
                            click.echo(
                                'Error: {} is not a valid file name'.format(
                                    state_file_name,
                                ),
                                err=True,
                            )
                            sanitized_filename = sanitize_filename(
                                state_file_name,
                            )
                            if click.confirm(
                                'Save to {}?'.format(
                                    sanitized_filename,
                                ),
                                default=True,
                            ):
                                state_file_name = sanitized_filename
                            else:
                                continue
                        if os.path.exists(state_file_name):
                            if not click.confirm(
                                'File {} already exists. Overwrite?'.format(
                                    state_file_name,
                                ),
                                default=False,
                            ):
                                continue
                        break

                    with open(state_file_name, 'w') as state_file:
                        yaml.dump(upload_state, state_file)
# NOTE(review): this excerpt begins mid-script — `subject_set`, `project`,
# `manifest_list`, `previous_subjects` and `directory` are defined outside
# this view.
subject_set.links.project = project
subject_set.display_name = set_name
subject_set.save()
print('Uploading subjects, This could take a while!')
new_subjects = 0
old_subjects = 0
failed_subjects = 0
working_on = []
# loop over the preloaded manifest file
for metadata in manifest_list:
    working_on = [metadata['subject'], metadata['image1']]
    # test for previously uploaded
    if metadata['image1'] not in previous_subjects:
        try:
            subject = Subject()
            subject.links.project = project
            # find the files in the metadata listing and add their locations
            for file in list(metadata.values())[1:]:
                if file.find('.jpg') > 0:
                    subject.add_location(directory + os.sep + file)
            # update subject metadata
            subject.metadata.update(metadata)
            # again nothing happens until the two lines below; comment them out for testing
            subject.save()
            subject_set.add(subject.id)
            new_subjects += 1
            build_part = '{} successfully uploaded at {}'.format(
                working_on, str(datetime.now())[0:19]) + '\n'
        except panoptes_client.panoptes.PanoptesAPIException:
            failed_subjects += 1
            build_part = 'An error occurred during the upload of {}'.format(working_on) + '\n'
def upload_chunks(self, chunks: str, project_id: int, set_name: str,
                  zooniverse_login="", zooniverse_pwd="", amount: int = 1000,
                  ignore_errors: bool = False, **kwargs):
    """Upload up to ``amount`` audio chunks from the CSV dataframe `chunks`
    to a zooniverse project.

    :param chunks: path to the chunk CSV dataframe
    :type chunks: str
    :param project_id: zooniverse project id
    :type project_id: int
    :param set_name: name of the subject set
    :type set_name: str
    :param zooniverse_login: zooniverse login. If not specified, the program
        attempts to get it from the environment variable ``ZOONIVERSE_LOGIN``
        instead, defaults to ''
    :type zooniverse_login: str, optional
    :param zooniverse_pwd: zooniverse password. If not specified, the program
        attempts to get it from the environment variable ``ZOONIVERSE_PWD``
        instead, defaults to ''
    :type zooniverse_pwd: str, optional
    :param amount: maximum amount of chunks to upload, defaults to 1000
    :type amount: int, optional
    :param ignore_errors: continue with the remaining chunks when a subject
        fails to save instead of halting, defaults to False
    :type ignore_errors: bool, optional
    """
    self.chunks_file = chunks
    self.get_credentials(zooniverse_login, zooniverse_pwd)

    metadata_location = os.path.join(self.chunks_file)
    try:
        self.chunks = pd.read_csv(metadata_location, index_col="index")
    except Exception as e:
        # chain the original cause (was a bare ``except:`` that hid it)
        raise Exception("cannot read chunk metadata from {}.".format(
            metadata_location)) from e

    assert_dataframe("chunks", self.chunks)
    assert_columns_presence(
        "chunks",
        self.chunks,
        {"recording_filename", "onset", "offset", "uploaded", "mp3"},
    )

    from panoptes_client import Panoptes, Project, Subject, SubjectSet

    Panoptes.connect(username=self.zooniverse_login,
                     password=self.zooniverse_pwd)
    zooniverse_project = Project(project_id)

    subjects_metadata = []
    uploaded = 0

    # reuse an existing subject set with this display name, or create one
    subject_set = None
    for ss in zooniverse_project.links.subject_sets:
        if ss.display_name == set_name:
            subject_set = ss

    if subject_set is None:
        subject_set = SubjectSet()
        subject_set.links.project = zooniverse_project
        subject_set.display_name = set_name
        subject_set.save()

    subjects = []

    # only consider chunks that were never uploaded, capped at ``amount``
    chunks_to_upload = self.chunks[self.chunks["uploaded"] == False].head(
        amount)
    chunks_to_upload = chunks_to_upload.to_dict(orient="index")

    if len(chunks_to_upload) == 0:
        print("nothing left to upload.")
        return

    for chunk_index in chunks_to_upload:
        chunk = chunks_to_upload[chunk_index]

        print("uploading chunk {} ({},{})".format(
            chunk["recording_filename"], chunk["onset"], chunk["offset"]))

        subject = Subject()
        subject.links.project = zooniverse_project
        subject.add_location(
            os.path.join(os.path.dirname(self.chunks_file), "chunks",
                         chunk["mp3"]))
        # NOTE(review): 'date_extracted' is not in the asserted column set —
        # presumably guaranteed by the chunk-extraction step; verify upstream.
        subject.metadata["date_extracted"] = chunk["date_extracted"]

        try:
            subject.save()
        except Exception as e:
            print("failed to save chunk {}. an exception has occured:\n{}".
                  format(chunk_index, str(e)))
            print(traceback.format_exc())

            # BUG FIX: previously read ``args.ignore_errors`` — ``args`` is
            # undefined here, so any save failure raised a NameError. Use the
            # method parameter instead.
            if ignore_errors:
                continue
            else:
                print("subject upload halting here.")
                break

        subjects.append(subject)

        # record zooniverse identifiers so the local CSV stays in sync
        chunk["index"] = chunk_index
        chunk["zooniverse_id"] = str(subject.id)
        chunk["project_id"] = str(project_id)
        chunk["subject_set"] = str(subject_set.display_name)
        chunk["uploaded"] = True
        subjects_metadata.append(chunk)

    if len(subjects) == 0:
        return

    subject_set.add(subjects)

    # persist the updated upload state back to the chunk CSV
    self.chunks.update(pd.DataFrame(subjects_metadata).set_index("index"))
    self.chunks.to_csv(self.chunks_file)
if os.path.isfile('./manga_mpl4_cutouts/cutouts/{0}.jpg'.format(row['MANGAID'].decode('utf-8'))): if counter < 75: if np.isnan(row['t01_smooth_or_features_a02_features_or_disk_weighted_fraction']): pbar = 'NaN' pspiral = 'NaN' dr8id = 'NaN' dr7id = 'NaN' specid = 'NaN' else: pbar = row['t01_smooth_or_features_a02_features_or_disk_weighted_fraction']*row['t02_edgeon_a05_no_weighted_fraction']*row['t03_bar_a06_bar_weighted_fraction'] pspiral = row['t01_smooth_or_features_a02_features_or_disk_weighted_fraction']*row['t02_edgeon_a05_no_weighted_fraction']*row['t04_spiral_a08_spiral_weighted_fraction'] dr8id = row['dr8objid'] dr7id = row['dr7objid'] specid = row['specobjid'] summer += 1 subject = Subject() subject.links.project = project subject.add_location('./manga_mpl4_cutouts/cutouts/{0}.jpg'.format(row['MANGAID'].decode('utf-8'))) subject.metadata['RA'] = row['RA'] subject.metadata['DEC'] = row['DEC'] subject.metadata['MANGAID'] = row['MANGAID'].decode('utf-8') subject.metadata['Z'] = row['Z'] subject.metadata['PETROTH50'] = row['PETROTH50'] subject.metadata['#MANGA_TILEID'] = row['MANGA_TILEID'] subject.metadata['#NSAID'] = row['NSAID'] subject.metadata['#SERSIC_TH50'] = row['SERSIC_TH50'] subject.metadata['#P(Bar)'] = pbar subject.metadata['#P(Spiral)'] = pspiral subject.metadata['#specobjid'] = specid subject.metadata['#dr8objid'] = dr8id subject.metadata['#dr7objid'] = dr7id