def push_new_row_subjects(self, source_subject, target_subject_set_id, row_paths_by_column):
    """
    Given image paths for the new column-indexed rows (row_paths_by_column),
    push new unclassified row subjects to the appropriate subject set, with
    metadata references to the source subject and column.
    """
    project = Project.find(settings.PROJECT_ID)
    destination_set = SubjectSet.find(target_subject_set_id)
    # Metadata fields carried over verbatim from the source document subject.
    inherited_fields = ['book', 'page']
    created = []
    for column_index, row_paths in row_paths_by_column.items():
        self._logger.info('Creating %d new row subjects for column index %d for subject %s', len(row_paths), column_index, source_subject.id)
        for row_path in row_paths:
            row_subject = Subject()
            row_subject.links.project = project
            for field in inherited_fields:
                row_subject.metadata[field] = source_subject.metadata[field]
            # Back-references so classifications can be traced to their origin.
            row_subject.metadata['source_document_subject_id'] = source_subject.id
            row_subject.metadata['source_document_column_index'] = column_index
            row_subject.add_location(row_path)
            row_subject.save()
            created.append(row_subject)
    # Link all newly created subjects to the set in one call.
    destination_set.add(created)
def add_new_subject(self, image_list, metadata_list, subject_set_name):
    """
    Create a new subject set and add one subject per image with its metadata.

    :param image_list: list of images to be added
    :param metadata_list: list of metadata dicts to be added, one per image
    :param subject_set_name: display name for the newly created subject set
    :raises ValueError: if image_list and metadata_list differ in length
    """
    # Fail fast on mismatched inputs. The original only printed a warning and
    # carried on, which would raise IndexError (or silently drop metadata)
    # mid-upload.
    if len(image_list) != len(metadata_list):
        raise ValueError("Image list and metadata list do not match")
    # Link to the subject set we want
    subject_set = SubjectSet()
    subject_set.links.project = self.project
    subject_set.display_name = subject_set_name
    subject_set.save()
    # Go through the image and metadata list and add the items
    new_subjects = []
    for image, metadata in zip(image_list, metadata_list):
        subject = Subject()
        subject.links.project = self.project
        subject.add_location(image)
        subject.metadata.update(metadata)
        subject.save()
        new_subjects.append(subject)
    subject_set.add(new_subjects)
def save_subject(manifest_item, project, pbar=None):
    """
    Add manifest item to project. Note: follow with subject_set.add(subject)
    to associate with subject set.

    Args:
        manifest_item (dict): of form {png_loc: img.png, key_data: some_data_dict}
        project (str): project to upload subject too e.g. '5773' for Galaxy Zoo
        pbar (tqdm.tqdm): progress bar to update. If None, no bar will display.

    Returns:
        Subject: the saved subject (the original docstring incorrectly said None)

    Raises:
        FileNotFoundError: if the png_loc file does not exist
    """
    png_loc = manifest_item['png_loc']
    # Validate before creating any remote object. The original used `assert`,
    # which is stripped under `python -O` and would let a missing file through.
    if not os.path.exists(png_loc):
        raise FileNotFoundError('Subject image not found: {}'.format(png_loc))
    subject = Subject()
    subject.links.project = project
    subject.add_location(png_loc)
    subject.metadata.update(manifest_item['key_data'])
    subject.save()
    if pbar:
        pbar.update()
    return subject
def pushSubject(subjectSet, project, imageLocations, metadata, livePost, max_retries=None):
    """
    Create, save, and link a subject when livePost is truthy.

    :param subjectSet: SubjectSet the saved subject is added to
    :param project: Project the subject is linked to
    :param imageLocations: iterable of image paths/URLs to attach
    :param metadata: dict of metadata to attach
    :param livePost: when falsy, do nothing and return None (dry-run)
    :param max_retries: maximum save attempts on ConnectionError; None (the
        default, matching the original behavior) retries indefinitely
    :return: the saved Subject, or None when livePost is falsy
    :raises ConnectionError: if max_retries is set and all attempts fail
    """
    if not livePost:
        # Dry-run mode: no remote calls at all.
        return None
    subject = Subject()
    subject.links.project = project
    for image in imageLocations:
        subject.add_location(image)
    subject.metadata.update(metadata)
    attempts = 0
    while True:
        try:
            subject.save()
            break
        except ConnectionError as e:
            attempts += 1
            print('{} , TRYING AGAIN'.format(e))
            # Bounded retry: the original looped forever on a persistent
            # network failure; re-raise once the budget is spent.
            if max_retries is not None and attempts >= max_retries:
                raise
    subjectSet.add(subject)
    return subject
def make_tutorial_images(imagePaths, ellipseData, projectData):
    """
    Upload one tutorial subject per image path to the configured subject set.

    projectData must provide 'user_name', 'password', and 'subject_set' keys;
    ellipseData is grouped by image index to build per-subject metadata via
    make_metadata().
    """
    # Connect to Panoptes
    Panoptes.connect(
        username=projectData["user_name"], password=projectData["password"]
    )
    # Fetch the target subject set once, up front — the original re-fetched it
    # from the API on every loop iteration.
    try:
        subjectSet = SubjectSet.find(projectData["subject_set"])
    except PanoptesAPIException as e:
        print(e)
        return
    newSubjects = []
    for imageId, imagePath in enumerate(imagePaths):
        print(f"Adding {imagePath}...")
        newSubject = Subject()
        newSubject.add_location(imagePath)
        newSubject.links.project = subjectSet.links.project
        newSubject.metadata.update(
            make_metadata(
                ellipseData.get_group(imageId).reset_index(drop=True), imagePath
            )
        )
        newSubject.save()
        newSubjects.append(newSubject)
    # Batch-link all new subjects in a single call.
    subjectSet.add(newSubjects)
def create_subject(project, metadata, media_files):
    """Build, populate, and persist a single Subject; return it."""
    new_subject = Subject()
    new_subject.links.project = project
    new_subject.metadata.update(metadata)
    for media_path in media_files:
        new_subject.add_location(media_path)
    new_subject.save()
    return new_subject
def _create_subject(self, project_id, filename, metadata=None):
    """Create and save one Subject in the given project, attaching the file
    at `filename` and any (non-empty) metadata mapping supplied."""
    new_subject = Subject()
    new_subject.links.project = Project.find(project_id)
    new_subject.add_location(filename)
    if metadata:
        new_subject.metadata.update(metadata)
    new_subject.save()
    return new_subject
def create_subject(project, media_files, metadata):
    """
    Create and save a single Zooniverse subject.

    Args:
    - project: a Project() object defining the Zooniverse project
    - media_files: a list of media files to link to the subject
    - metadata: a dictionary with metadata to attach
    """
    new_subject = Subject()
    new_subject.links.project = project
    for media_path in media_files:
        new_subject.add_location(media_path)
    new_subject.metadata.update(metadata)
    new_subject.save()
    return new_subject
def upload_subject(locations: List, project: Project, subject_set_name: str, metadata: Dict):
    """Create a subject from local files, save it, link it into the named
    subject set (created on demand), and return its id.

    Raises FileNotFoundError if any location is not an existing file.
    """
    subject = Subject()
    subject.links.project = project
    # Attach every local file, refusing to continue if one is missing.
    for path in locations:
        if not os.path.isfile(path):
            raise FileNotFoundError(
                'Missing subject location: {}'.format(path))
        subject.add_location(path)
    subject.metadata.update(metadata)
    subject_set = get_or_create_subject_set(project.id, subject_set_name)
    subject.save()
    subject_set.add(subject)
    return subject.id
def upload_images(id, use_database=True):
    """
    Create a subject set named after `id` and upload the images listed in the
    *-manifest.txt files found under the target directory, then link the set
    to workflow 11973.

    NOTE(review): `id` shadows the builtin; kept as-is for caller compatibility.
    """
    print('Create subject set and upload images for', id)
    if use_database:
        update_status(id, gz_status='Uploading')
    # NOTE(review): wd is captured but never used or restored after os.chdir.
    wd = os.getcwd()
    # NOTE(review): username '******' looks redacted — confirm real credentials
    # are supplied before running.
    Panoptes.connect(username='******', password=os.environ['PANOPTES_PASSWORD'])
    os.chdir(target + id)
    project = Project.find(slug='chrismrp/radio-galaxy-zoo-lofar')
    subject_set = SubjectSet()
    subject_set.display_name = id
    subject_set.links.project = project
    subject_set.save()
    print('Made subject set')
    new_subjects = []
    g = glob.glob('*-manifest.txt')
    for i, f in enumerate(g):
        # Only the first line of each manifest is read; fields are
        # comma-separated: [0]=subject id, [1:4]=image locations,
        # [4]=source name, [5]=ra, [6]=dec, [7]=size (per the uses below).
        bits = open(f).readlines()[0].split(',')
        metadata = {
            'subject_id': int(bits[0]),
            'ra': float(bits[5]),
            'dec': float(bits[6]),
            '#size': float(bits[7]),
            'source_name': bits[4]
        }
        print('Upload doing', bits[4], '%i/%i' % (i, len(g)))
        subject = Subject()
        subject.links.project = project
        subject.metadata.update(metadata)
        for location in bits[1:4]:
            subject.add_location(location)
        subject.save()
        new_subjects.append(subject)
    # Link all uploaded subjects, then attach the set to the workflow.
    subject_set.add(new_subjects)
    workflow = Workflow(11973)
    workflow.links.subject_sets.add(subject_set)
    if use_database:
        update_status(id, gz_status='In progress')
    print('Done!')
def upload_subjects(subject_set_id, manifest_file):
    """
    Upload one subject per row of a CSV manifest to an existing subject set.

    Each row's columns matching IMAGE_REGEX and pointing at an existing file
    become the subject's media; the whole row (zipped with the header) becomes
    its metadata. Subjects are linked in batches of LINK_BATCH_SIZE.

    Returns -1 (after an error message) if a row contains no usable files.
    """
    subject_set = SubjectSet.find(subject_set_id)
    subject_rows = []
    with open(manifest_file) as manifest_f:
        file_root = os.path.dirname(manifest_file)
        r = csv.reader(manifest_f)
        # next(r) works on Python 2 and 3; the original r.next() was Py2-only.
        headers = next(r)
        for row in r:
            metadata = dict(zip(headers, row))
            files = []
            for col in row:
                file_match = re.match(IMAGE_REGEX, col)
                file_path = os.path.join(file_root, col)
                if file_match and os.path.exists(file_path):
                    files.append(file_path)
            if len(files) == 0:
                click.echo('Could not find any files in row:', err=True)
                click.echo(','.join(row), err=True)
                return -1
            subject_rows.append((files, metadata))
    created_subjects = []
    with click.progressbar(
        enumerate(subject_rows),
        length=len(subject_rows),
        label='Uploading subjects',
    ) as _subject_rows:
        for count, (files, metadata) in _subject_rows:
            subject = Subject()
            subject.links.project = subject_set.links.project
            # Explicit loop instead of map(): under Python 3 map() is lazy, so
            # map(subject.add_location, files) never actually attached anything.
            for file_path in files:
                subject.add_location(file_path)
            subject.metadata.update(metadata)
            subject.save()
            created_subjects.append(subject)
            # Link in batches to limit request size.
            if (count + 1) % LINK_BATCH_SIZE == 0:
                subject_set.add(created_subjects)
                created_subjects = []
        if len(created_subjects) > 0:
            subject_set.add(created_subjects)
def _create_subjects_from_epicollect5(self, project, subjects_metadata):
    """Build, save, and return one Subject per Epicollect5 record."""
    # Keys copied verbatim from each record, in their original order.
    copied_keys = ('id', 'project', 'obs_type', 'source',
                   'url', 'created_at', 'observer')
    created = []
    for record in subjects_metadata:
        subject = Subject()
        for key in copied_keys:
            subject.metadata[key] = record[key]
        # Location is nested; spectrum_type may be absent ("?" placeholder).
        subject.metadata['longitude'] = record['location']['longitude']
        subject.metadata['latitude'] = record['location']['latitude']
        subject.metadata['comment'] = record['comment']
        subject.metadata['spectrum_type'] = record.get('spectrum_type', "?")
        subject.add_location({'image/jpg': record['url']})
        subject.links.project = project
        subject.save()
        created.append(subject)
    return created
def create_subjects_and_link_to_project(self, proto_subjects, project_id, workflow_id, subject_set_id):
    """
    Create one subject per proto_subject (two locations + metadata), add them
    to a subject set (created with a timestamp name when subject_set_id is
    None), and link that set to the workflow.

    Credentials come from the PANOPTES_USERNAME / PANOPTES_PASSWORD
    environment variables. Errors are logged, not raised (boundary method).
    """
    try:
        USERNAME = os.getenv('PANOPTES_USERNAME')
        PASSWORD = os.getenv('PANOPTES_PASSWORD')
        Panoptes.connect(username=USERNAME, password=PASSWORD,
                         endpoint=self.ENDPOINT)
        project = Project.find(project_id)
        # find() is a classmethod — no need to build a throwaway instance.
        workflow = Workflow.find(workflow_id)
        if subject_set_id is None:
            # No target set given: create one named with the current UTC time.
            subject_set = SubjectSet()
            ts = time.gmtime()
            subject_set.display_name = time.strftime(
                "%m-%d-%Y %H:%M:%S", ts)
            subject_set.links.project = project
            subject_set.save()
        else:
            subject_set = SubjectSet.find(subject_set_id)
        subjects = []
        for proto_subject in proto_subjects:
            subject = Subject()
            subject.links.project = project
            subject.add_location(proto_subject['location_lc'])
            subject.add_location(proto_subject['location_ps'])
            subject.metadata.update(proto_subject['metadata'])
            subject.save()
            subjects.append(subject)
        subject_set.add(subjects)
        workflow.add_subject_sets(subject_set)
    except Exception:
        # Top-level boundary: log with traceback rather than crash the caller.
        self.log.exception("Error in create_subjects_and_link_to_project ")
def create_subjects_and_link_to_project(proto_subjects, project_id, subject_set_id, subject_set_name=None):
    '''
    find the project and relevant subject set. Get the existing subject data
    and compare to the new proto_subjects. Upload any instances of new
    subjects to the project

    Keyword Arguments:
    proto_subjects -- dictionary mapping subject filepath+filename to its associated metadata
    project_id -- identifier to find and link with the project
    subject_set_id -- identifier for the subject set of interest; None creates a new set
    subject_set_name -- optional display name; must match when subject_set_id is given

    Returns -1 on a name mismatch, otherwise None.
    '''
    # get the project object
    project = Project.find(project_id)
    # set up subject_set
    if subject_set_id == None:
        subject_set = SubjectSet()  # create empty subject_set
        subject_set.links.project = project
        if subject_set_name == None:
            # if not defined generate a random subject set name to avoid error when a set already exists
            # NOTE(review): `date` is presumably a module-level date/datetime
            # object defined outside this chunk — confirm it is an instance,
            # not the `datetime.date` class itself.
            subject_set_name = 'subject_set_{:02d}_{:02d}_{:04d}_{}'.format(
                date.day, date.month, date.year, ''.join(generate_random_str()))
            print("will create a subject set called: {}".format(subject_set_name))
        subject_set.display_name = subject_set_name  # set the name of the subject set
        subject_set.save()
        project.reload()
    else:
        subject_set = SubjectSet().find(
            subject_set_id)  # find the existing subject_set
        existing_subject_set_name = subject_set.display_name  # get its name
        # if you have tried to set the subject set name, check that it matches the name for the chosen subject set id
        if (subject_set_name != None) and (existing_subject_set_name != subject_set_name):
            print(
                "your chosen subject set name does not match the existing name: {}, {}"
                .format(subject_set_name, existing_subject_set_name))
            return -1
        else:
            subject_set_name = existing_subject_set_name
            print("add to existing subject set: {}".format(subject_set_name))
    # Create a list of the existing subject metadata
    meta_list = []
    print("existing subjects:")
    for subject in subject_set.subjects:
        print(subject.id, subject.metadata)
        meta_list.append(subject.metadata)
    # When making list of subjects to add, check to see if the metadata of the
    # subject you want to add is already in the set
    print("new subjects:")
    new_subjects = []
    for filename, metadata in proto_subjects.items():
        # check if this subject is already in the subject set
        # NOTE(review): np.isin on dict arguments relies on numpy object-array
        # comparison; a plain `metadata in meta_list` membership test may be
        # what is intended — confirm behaviour.
        if np.isin(metadata, meta_list):
            print("{}, subject already in set".format(metadata))
            # In this case we skip over the subject that already exists.
            # N.B. you may want to remove an existing subject and update it with the new one
            continue
        # Otherwise we can add the subject to the new subject list
        else:
            subject = Subject()
            subject.links.project = project
            subject.add_location(filename)
            subject.metadata.update(metadata)
            subject.save()
            new_subjects.append(subject)
            print("{}, new subject add to list".format(metadata))
    print("new subjects to add: {}".format(new_subjects))
    # add the new subject list (data and metadata) to the already defined project subject set
    subject_set.add(new_subjects)
    return
def upload_subjects(
    subject_set_id,
    manifest_files,
    allow_missing,
    remote_location,
    mime_type,
    file_column,
):
    """
    Uploads subjects from each of the given MANIFEST_FILES.

    Example with only local files:

    $ panoptes subject-set upload-subjects 4667 manifest.csv

    Local filenames will be automatically detected in the manifest and
    uploaded, or filename columns can be specified with --file-column.

    If you are hosting your media yourself, you can put the URLs in the
    manifest and specify the column number(s):

    $ panoptes subject-set upload-subjects -r 1 4667 manifest.csv

    $ panoptes subject-set upload-subjects -r 1 -r 2 4667 manifest.csv

    Any local files will still be detected and uploaded.
    """
    # A .yaml manifest is a saved upload state from a previous failed run.
    if (
        len(manifest_files) > 1
        and any(map(lambda m: m.endswith('.yaml'), manifest_files))
    ):
        click.echo(
            'Error: YAML manifests must be processed one at a time.',
            err=True,
        )
        return -1
    elif manifest_files[0].endswith('.yaml'):
        with open(manifest_files[0], 'r') as yaml_manifest:
            upload_state = yaml.load(yaml_manifest, Loader=yaml.FullLoader)
        if upload_state['state_version'] > CURRENT_STATE_VERSION:
            click.echo(
                'Error: {} was generated by a newer version of the Panoptes '
                'CLI and is not compatible with this version.'.format(
                    manifest_files[0],
                ),
                err=True,
            )
            return -1
        if upload_state['subject_set_id'] != subject_set_id:
            click.echo(
                'Warning: You specified subject set {} but this YAML '
                'manifest is for subject set {}.'.format(
                    subject_set_id,
                    upload_state['subject_set_id'],
                ),
                err=True,
            )
            click.confirm(
                'Upload {} to subject set {} ({})?'.format(
                    manifest_files[0],
                    subject_set_id,
                    SubjectSet.find(subject_set_id).display_name,
                ),
                abort=True
            )
            upload_state['subject_set_id'] = subject_set_id
        resumed_upload = True
    else:
        # Fresh upload: build the initial state dict from the CLI arguments.
        upload_state = {
            'state_version': CURRENT_STATE_VERSION,
            'subject_set_id': subject_set_id,
            'manifest_files': manifest_files,
            'allow_missing': allow_missing,
            'remote_location': remote_location,
            'mime_type': mime_type,
            'file_column': file_column,
            'waiting_to_upload': [],
            'waiting_to_link': {},
        }
        resumed_upload = False

    # One MIME type may be broadcast across all remote-location columns;
    # otherwise the counts must match exactly.
    remote_location_count = len(upload_state['remote_location'])
    mime_type_count = len(upload_state['mime_type'])
    if remote_location_count > 1 and mime_type_count == 1:
        upload_state['mime_type'] = (
            upload_state['mime_type'] * remote_location_count
        )
    elif remote_location_count > 0 and mime_type_count != remote_location_count:
        click.echo(
            'Error: The number of MIME types given must be either 1 or equal '
            'to the number of remote locations.',
            err=True,
        )
        return -1

    def validate_file(file_path):
        # Reject missing, empty, or oversized files with a user-facing error.
        if not os.path.isfile(file_path):
            click.echo(
                'Error: File "{}" could not be found.'.format(
                    file_path,
                ),
                err=True,
            )
            return False
        file_size = os.path.getsize(file_path)
        if file_size == 0:
            click.echo(
                'Error: File "{}" is empty.'.format(
                    file_path,
                ),
                err=True,
            )
            return False
        elif file_size > MAX_UPLOAD_FILE_SIZE:
            click.echo(
                'Error: File "{}" is {}, larger than the maximum {}.'.format(
                    file_path,
                    humanize.naturalsize(file_size),
                    humanize.naturalsize(MAX_UPLOAD_FILE_SIZE),
                ),
                err=True,
            )
            return False
        return True

    subject_set = SubjectSet.find(upload_state['subject_set_id'])
    if not resumed_upload:
        subject_rows = []
        for manifest_file in upload_state['manifest_files']:
            # newline='' is the csv-module-recommended open mode; the original
            # 'U' (universal newlines) mode was removed in Python 3.11.
            with open(manifest_file, 'r', newline='') as manifest_f:
                file_root = os.path.dirname(manifest_file)
                r = csv.reader(manifest_f, skipinitialspace=True)
                headers = next(r)
                for row in r:
                    metadata = dict(zip(headers, row))
                    files = []
                    if not upload_state['file_column']:
                        # Auto-detect: any column that resolves to an existing
                        # local file becomes a media column for all rows.
                        upload_state['file_column'] = []
                        for field_number, col in enumerate(row, start=1):
                            file_path = os.path.join(file_root, col)
                            if os.path.exists(file_path):
                                upload_state['file_column'].append(
                                    field_number,
                                )
                                if not validate_file(file_path):
                                    return -1
                                files.append(file_path)
                    else:
                        for field_number in upload_state['file_column']:
                            file_path = os.path.join(
                                file_root,
                                row[field_number - 1]
                            )
                            if not validate_file(file_path):
                                return -1
                            files.append(file_path)
                    # Remote media are passed as {mime_type: url} dicts.
                    for field_number, _mime_type in zip(
                        upload_state['remote_location'],
                        upload_state['mime_type'],
                    ):
                        files.append({_mime_type: row[field_number - 1]})
                    if len(files) == 0:
                        click.echo(
                            'Could not find any files in row:',
                            err=True,
                        )
                        click.echo(','.join(row), err=True)
                        if not upload_state['allow_missing']:
                            return -1
                        else:
                            continue
                    subject_rows.append((files, metadata))
            if not subject_rows:
                click.echo(
                    'File {} did not contain any rows.'.format(
                        manifest_file,
                    ),
                    err=True,
                )
                return -1
        subject_rows = list(enumerate(subject_rows))
        upload_state['waiting_to_upload'] = copy.deepcopy(subject_rows)
    else:
        # Resuming: re-queue any subject that was created but no longer exists
        # server-side. Iterate over a snapshot — deleting from the dict while
        # iterating .items() directly raises RuntimeError on Python 3.
        for subject_id, subject_row in list(
            upload_state['waiting_to_link'].items()
        ):
            try:
                subject = Subject.find(subject_id)
            except PanoptesAPIException:
                upload_state['waiting_to_upload'].append(subject_row)
                del upload_state['waiting_to_link'][subject_id]
        subject_rows = copy.deepcopy(upload_state['waiting_to_upload'])

    pending_subjects = []

    def move_created(limit):
        # Poll async saves until at most `limit` subjects remain pending,
        # promoting completed ones to the waiting_to_link queue.
        while len(pending_subjects) > limit:
            for subject, subject_row in pending_subjects:
                if subject.async_save_result:
                    pending_subjects.remove((subject, subject_row))
                    upload_state['waiting_to_upload'].remove(subject_row)
                    upload_state['waiting_to_link'][subject.id] = subject_row
            time.sleep(0.5)

    def link_subjects(limit):
        # Link queued subjects to the set once the batch threshold is passed.
        if len(upload_state['waiting_to_link']) > limit:
            subject_set.add(list(upload_state['waiting_to_link'].keys()))
            upload_state['waiting_to_link'].clear()

    with click.progressbar(
        subject_rows,
        length=len(subject_rows),
        label='Uploading subjects',
    ) as _subject_rows:
        try:
            with Subject.async_saves():
                for subject_row in _subject_rows:
                    count, (files, metadata) = subject_row
                    subject = Subject()
                    subject.links.project = subject_set.links.project
                    for media_file in files:
                        subject.add_location(media_file)
                    subject.metadata.update(metadata)
                    subject.save()
                    pending_subjects.append((subject, subject_row))
                    move_created(MAX_PENDING_SUBJECTS)
                    link_subjects(LINK_BATCH_SIZE)
                # Drain both queues completely at the end.
                move_created(0)
                link_subjects(0)
        finally:
            # Anything still queued here means the upload did not finish;
            # offer to persist the state so it can be resumed later.
            if (
                len(pending_subjects) > 0
                or len(upload_state['waiting_to_link']) > 0
            ):
                click.echo('Error: Upload failed.', err=True)
                if click.confirm(
                    'Would you like to save the upload state to resume the '
                    'upload later?',
                    default=True,
                ):
                    while True:
                        state_file_name = 'panoptes-upload-{}.yaml'.format(
                            subject_set_id,
                        )
                        state_file_name = click.prompt(
                            'Enter filename to save to',
                            default=state_file_name,
                        )
                        if not state_file_name.endswith('.yaml'):
                            click.echo(
                                'Error: File name must end in ".yaml".',
                                err=True,
                            )
                            if click.confirm(
                                'Save to {}.yaml?'.format(state_file_name),
                                default=True,
                            ):
                                state_file_name += '.yaml'
                            else:
                                continue
                        if not is_valid_filename(state_file_name):
                            click.echo(
                                'Error: {} is not a valid file name'.format(
                                    state_file_name,
                                ),
                                err=True,
                            )
                            sanitized_filename = sanitize_filename(
                                state_file_name,
                            )
                            if click.confirm(
                                'Save to {}?'.format(
                                    sanitized_filename,
                                ),
                                default=True,
                            ):
                                state_file_name = sanitized_filename
                            else:
                                continue
                        if os.path.exists(state_file_name):
                            if not click.confirm(
                                'File {} already exists. Overwrite?'.format(
                                    state_file_name,
                                ),
                                default=False,
                            ):
                                continue
                        break
                    with open(state_file_name, 'w') as state_file:
                        yaml.dump(upload_state, state_file)
images = [a['src'] for a in soup.find_all("img", {"src": re.compile("gstatic.com")})] #print images for img in images: raw_img = urllib2.urlopen(img).read() #add the directory for your image here DIR="images/" cntr = len([i for i in os.listdir(DIR) if image_type in i]) + 1 f = open(DIR + image_type + "_"+ str(cntr)+".jpg", 'wb') f.write(raw_img) f.close() print 'Creating image set...' # create the subject set. subject_set = SubjectSet() subject_set.links.project = p subject_set.display_name = "Images of " + thing + '\'s' subject_set.save() print 'Uploading images to Zooniverse...' # add all images to subject set for i in range(1,21): subject = Subject() subject.links.project = p subject.add_location('images/' + str(thing) + '_' + str(i)+'.jpg') subject.save() subject_set.add(subject) print 'Complete.'
# Tail of transform_item_segments() — its `def` line (and the opening of this
# per-segment dict literal) sits above this chunk, so only the closing brace,
# the append, and the return are visible here.
    }
    segments.append(segment)
    print('Item segments transformation complete.')
    return segments

# --- script body: build the segments, then upload each one as a subject ---
segments = transform_item_segments('https://www.loc.gov/item/' + LIBRARY_OF_CONGRESS_ITEM_ID)
Panoptes.connect(username=USERNAME, password=PASSWORD, endpoint=ENDPOINT)
project = Project.find(PROJECT)
subject_set = SubjectSet()
subject_set.links.project = project
subject_set.display_name = segments[0]['metadata']['Title']  # uses item Title as default subject set name, or feel free to hardcode
subject_set.save()
print('Begin Zooniverse subject upload...')
# One subject per segment: attach its media and metadata, save, and link.
for segment in segments:
    subject = Subject()
    subject.links.project = project
    subject.add_location(segment['location'])
    subject.metadata.update(segment['metadata'])
    subject.save()
    subject_set.add(subject)
print("Zooniverse subject upload complete.")
def upload_chunks(self, chunks: str, project_id: int, set_name: str,
                  zooniverse_login="", zooniverse_pwd="", amount: int = 1000,
                  ignore_errors: bool = False, **kwargs):
    """Uploads ``amount`` audio chunks from the CSV dataframe `chunks` to a
    zooniverse project.

    :param chunks: path to the chunk CSV dataframe
    :type chunks: str
    :param project_id: zooniverse project id
    :type project_id: int
    :param set_name: name of the subject set
    :type set_name: str
    :param zooniverse_login: zooniverse login. If not specified, the program
        attempts to get it from the environment variable ``ZOONIVERSE_LOGIN``
        instead, defaults to ''
    :type zooniverse_login: str, optional
    :param zooniverse_pwd: zooniverse password. If not specified, the program
        attempts to get it from the environment variable ``ZOONIVERSE_PWD``
        instead, defaults to ''
    :type zooniverse_pwd: str, optional
    :param amount: amount of chunks to upload, defaults to 1000
    :type amount: int, optional
    :param ignore_errors: continue uploading remaining chunks when a save
        fails instead of stopping, defaults to False
    :type ignore_errors: bool, optional
    """
    self.chunks_file = chunks
    self.get_credentials(zooniverse_login, zooniverse_pwd)
    metadata_location = os.path.join(self.chunks_file)
    try:
        self.chunks = pd.read_csv(metadata_location, index_col="index")
    except Exception as e:
        # Chain the original error instead of hiding it behind a bare except.
        raise Exception("cannot read chunk metadata from {}.".format(
            metadata_location)) from e

    assert_dataframe("chunks", self.chunks)
    assert_columns_presence(
        "chunks",
        self.chunks,
        {"recording_filename", "onset", "offset", "uploaded", "mp3"},
    )

    from panoptes_client import Panoptes, Project, Subject, SubjectSet

    Panoptes.connect(username=self.zooniverse_login,
                     password=self.zooniverse_pwd)
    zooniverse_project = Project(project_id)

    subjects_metadata = []
    # Reuse an existing subject set with this name, or create a new one.
    subject_set = None
    for ss in zooniverse_project.links.subject_sets:
        if ss.display_name == set_name:
            subject_set = ss
    if subject_set is None:
        subject_set = SubjectSet()
        subject_set.links.project = zooniverse_project
        subject_set.display_name = set_name
        subject_set.save()

    subjects = []
    chunks_to_upload = self.chunks[self.chunks["uploaded"] == False].head(
        amount)
    chunks_to_upload = chunks_to_upload.to_dict(orient="index")

    if len(chunks_to_upload) == 0:
        print("nothing left to upload.")
        return

    for chunk_index in chunks_to_upload:
        chunk = chunks_to_upload[chunk_index]
        print("uploading chunk {} ({},{})".format(
            chunk["recording_filename"], chunk["onset"], chunk["offset"]))
        subject = Subject()
        subject.links.project = zooniverse_project
        subject.add_location(
            os.path.join(os.path.dirname(self.chunks_file), "chunks",
                         chunk["mp3"]))
        subject.metadata["date_extracted"] = chunk["date_extracted"]
        try:
            subject.save()
        except Exception as e:
            print("failed to save chunk {}. an exception has occured:\n{}".
                  format(chunk_index, str(e)))
            print(traceback.format_exc())
            # Bug fix: this previously read `args.ignore_errors`, but no
            # `args` exists in this scope — the parameter is `ignore_errors`.
            if ignore_errors:
                continue
            else:
                print("subject upload halting here.")
                break
        subjects.append(subject)
        # Record bookkeeping so the CSV reflects what was uploaded where.
        chunk["index"] = chunk_index
        chunk["zooniverse_id"] = str(subject.id)
        chunk["project_id"] = str(project_id)
        chunk["subject_set"] = str(subject_set.display_name)
        chunk["uploaded"] = True
        subjects_metadata.append(chunk)

    if len(subjects) == 0:
        return

    subject_set.add(subjects)
    # Persist the updated 'uploaded' flags and ids back to the chunk CSV.
    self.chunks.update(pd.DataFrame(subjects_metadata).set_index("index"))
    self.chunks.to_csv(self.chunks_file)
# Upload loop: each entry in `images` is either a CSV manifest or an image
# file. Relies on names defined outside this chunk: images, project,
# subject_set, new_subjects, completed_images, errorfiles, logfile,
# image_count.
for img in images:
    try:
        s = Subject()
        s.links.project = project
        # manifest file
        if os.path.splitext(img)[1] == ".csv":
            # upload manifest info.... not sure how this will be set up after second step
            # move csv to complete images folder
            # NOTE(review): copies `f`, not `img` — `f` is only assigned in the
            # except handler below, so on the first iteration it is undefined;
            # this looks like it should be shutil.copy(img, completed_images).
            shutil.copy(f, completed_images)
            # make dict out of csv file for upload
            # NOTE(review): DictReader yields one dict per row; passing the
            # reader object straight to dict.update() will not merge rows as
            # a single mapping — confirm intended behaviour.
            manifest = csv.DictReader(open(img))
            s.metadata.update(manifest)
        else:
            # upload image to subject
            s.add_location(img)
        s.save()
        new_subjects.append(s)
        image_count+=1
    except Exception as e:
        # Log the failure and quarantine the offending file.
        f = open(logfile, "a")
        t = time.localtime()
        # move error files into seperate folder
        os.rename(img, errorfiles + os.path.basename(os.path.normpath(img)))
        f.write('Unable to upload ' + img + ': ' + str(e) + ' '+time.strftime("%D:%H:%M:%S", t)+'\n\n')
        f.close()
try:
    # add subjects to subject set
    subject_set.save()
    subject_set.add(new_subjects)
except Exception as e:
    # NOTE(review): the handler body is truncated in this chunk — the suite
    # for this except clause is not visible here.
def main():
    """Read image filenames from a file, bin them into subject sets of at
    most n images each, upload them to Zooniverse, and record the mapping
    in MongoDB."""
    ap = argparse.ArgumentParser(
        description=
        'Given a list of images, bins them into subject sets of size n')
    # require file path to read in images
    ap.add_argument('-f',
                    '--filename',
                    required=True,
                    dest='filename',
                    type=str,
                    help='The name of the file from which to read the images')
    # optionally require subject set size; defaults to 1000
    ap.add_argument(
        '-n',
        '--size',
        required=False,
        dest='n',
        type=int,
        default=1000,
        help='The maximum number of images a subject set should contain. \
The value should be between 1 and 10000, inclusive')
    # parse args into variables and check values
    args = vars(ap.parse_args())
    # NOTE(review): the fallbacks to None are dead code — --filename is
    # required and -n has a default, so both values are always present.
    filename = args['filename'] if args['filename'] else None
    n = args['n'] if args['n'] else None
    if not (n >= 1 and n <= 10000):
        raise ValueError('n must be between 1 and 10000, inclusive')
    # connect to zooniverse
    Panoptes.connect(username=zooniverse_config.Zooniverse_USERNAME,
                     password=zooniverse_config.Zooniverse_PASS)
    project = Project.find(zooniverse_config.Project_ID)
    # connection to mongodb
    mongoConn = MongoClient(csh_db_config.DB_HOST + ":" +
                            str(csh_db_config.DB_PORT))
    cshTransDB = mongoConn[csh_db_config.TRANSCRIPTION_DB_NAME]
    cshTransDB.authenticate(csh_db_config.TRANSCRIPTION_DB_USER,
                            csh_db_config.TRANSCRIPTION_DB_PASS)
    cshCollection = cshTransDB[csh_db_config.TRANS_DB_MeetingMinColl]
    # track subject sets being created
    subjectSets = []
    # get the image filenames in a Python list
    with open(filename) as handle:
        filenames = handle.readlines()
    # divide files into groups of n (zip_longest pads the last group with
    # None, which the inner comprehension strips back out)
    filegroups = list([e for e in t if e != None]
                      for t in itertools.zip_longest(*([iter(filenames)] * n)))
    for group in filegroups:
        displayName = '{:%Y-%b-%d %H:%M:%S}'.format(datetime.datetime.now())
        # create a new subject set
        subjectSet = SubjectSet()
        subjectSet.links.project = project
        subjectSet.display_name = displayName
        subjectSet.save()
        subjectSetId = subjectSet.id
        subjectSets.append(subjectSetId)
        # create a new subject for each file and add to the subject set
        # NOTE(review): the loop variable below shadows the outer `filename`
        # (the input-list path) for the rest of the function.
        for filename in group:
            # remove trailing '\n' character
            filename = filename.rstrip()
            # create a new subject
            subject = Subject()
            subject.links.project = project
            filepath = cshCollection.find_one({'_id':
                                               filename})['file']['anonPath']
            subject.add_location(filepath)
            subject.metadata['ID'] = filename
            subject.save()
            # add to subject set
            subjectSet.add(subject)
            # retrieve and update the record from mongodb
            updateQuery = {
                '$set': {
                    'canCrowdsource': True,
                    'transcription': {
                        'numClassifications': 5,
                        'subjectSetId': subjectSetId,
                        'status': 'sent'
                    }
                }
            }
            record = cshCollection.find_one_and_update({'_id': filename},
                                                       updateQuery)
    # add subject sets to the workflow
    workflow = project.links.workflows[0]
    workflow.add_subject_sets(subjectSets)
    # print helpful information to the console
    print('{} subject sets created with the following IDs: {}'.format(
        len(subjectSets), subjectSets))
def upload_chunks(self, destination, project_slug, set_prefix, zooniverse_login, zooniverse_pwd, batches=0, **kwargs):
    """
    Upload un-uploaded audio chunk batches to Zooniverse.

    For every batch in <destination>/chunks.csv that is not fully uploaded,
    creates a subject set named "<set_prefix>_batch_<batch>", uploads one
    subject per chunk, links them, and writes the updated bookkeeping
    (zooniverse_id, subject_set, uploaded flag) back to chunks.csv.

    :param destination: directory holding chunks.csv and the chunks/ folder
    :param project_slug: Zooniverse project slug to upload into
    :param set_prefix: prefix for generated subject set names
    :param zooniverse_login: Zooniverse account login
    :param zooniverse_pwd: Zooniverse account password
    :param batches: stop after this many batches; 0 means no limit
    :raises Exception: if chunks.csv cannot be read (original cause chained)
    """
    self.destination = destination

    metadata_location = os.path.join(self.destination, 'chunks.csv')
    try:
        self.chunks = pd.read_csv(metadata_location, index_col='index')
    except Exception as e:
        # except Exception (not bare except) so Ctrl-C still interrupts, and
        # chain the cause so the underlying read error is not lost.
        raise Exception(
            "cannot read chunk metadata in {}. Check the --destination parameter, and make sure you have extracted chunks before."
            .format(metadata_location)) from e

    Panoptes.connect(username=zooniverse_login, password=zooniverse_pwd)
    zooniverse_project = Project.find(slug=project_slug)

    uploaded = 0
    for batch, chunks in self.chunks.groupby('batch'):
        # Skip batches that have already been fully uploaded.
        if chunks['uploaded'].all():
            continue

        subjects_metadata = []
        subject_set = SubjectSet()
        subject_set.links.project = zooniverse_project
        subject_set.display_name = "{}_batch_{}".format(set_prefix, batch)
        subject_set.save()
        subjects = []

        _chunks = chunks.to_dict(orient='index')
        for chunk_index in _chunks:
            chunk = _chunks[chunk_index]
            print("uploading chunk {} ({},{}) in batch {}".format(
                chunk['recording'], chunk['onset'], chunk['offset'], batch))

            subject = Subject()
            subject.links.project = zooniverse_project
            subject.add_location(
                os.path.join(self.destination, 'chunks', chunk['mp3']))
            subject.metadata['date_extracted'] = chunk['date_extracted']
            subject.save()
            subjects.append(subject)

            # Record where each chunk ended up for the CSV round-trip below.
            chunk['index'] = chunk_index
            chunk['zooniverse_id'] = subject.id
            chunk['project_slug'] = project_slug
            chunk['subject_set'] = str(subject_set.display_name)
            chunk['uploaded'] = True
            subjects_metadata.append(chunk)

        subject_set.add(subjects)
        # Persist progress after every batch so an interruption loses little.
        self.chunks.update(
            pd.DataFrame(subjects_metadata).set_index('index'))
        self.chunks.to_csv(os.path.join(self.destination, 'chunks.csv'))
        uploaded += 1
        if batches > 0 and uploaded >= batches:
            return
# Duplicate the subjects listed in `add_subjects` into a brand-new subject
# set, then write a CSV report of the linked subjects. Relies on names
# defined outside this chunk: proj, proj_id, new_set_name, add_subjects.
subject_set_new = SubjectSet()
subject_set_new.links.project = proj
subject_set_new.display_name = new_set_name
subject_set_new.save()
# iterate through the subjects duplicating them and verifying they are created.
k = 0
for old_sub in add_subjects:
    old_subject = Subject(old_sub)
    try:
        new_subject = Subject()
        new_subject.links.project = proj
        for loc in old_subject.locations:
            new_subject.add_location(loc)
        # NOTE(review): this assigns a reference to the old subject's metadata
        # object rather than copying it — confirm sharing is acceptable here.
        new_subject.metadata = old_subject.metadata
        new_subject.save()
        subject_set_new.add(new_subject)
        print(new_subject.id, 'duplicated in new set to new set')
        k += 1
    except panoptes_client.panoptes.PanoptesAPIException:
        print(old_sub, 'did not duplicate correctly', str(sys.exc_info()[1]))
print(k, ' subjects linked to subject set ', new_set_name, ' in project ', proj_id)
linked = 0
with open(os.getcwd() + os.sep + 'duplicated_subjects.csv', 'wt', newline='', encoding='utf-8') as file:
    fieldnames = ['subject_id', 'Metadata', 'Locations']
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    # NOTE(review): .next() on the where() iterator — presumably the
    # panoptes_client pagination object exposes next(); confirm, since plain
    # Python 3 iterators require next(obj) instead.
    subject_set = SubjectSet.where(project_id=proj_id, display_name=new_set_name).next()
    # NOTE(review): this loop body (and possibly the CSV row-writing) appears
    # to continue beyond the end of this chunk.
    for subject in subject_set.subjects:
        linked += 1