def add_subject_set(self, display_name, subjects_metadata):
    """Create a subject set, fill it with subjects and add it to each workflow.

    The source of the first metadata entry is checked *before* anything is
    saved, so an unsupported source (or an empty metadata list) no longer
    leaves an empty, orphaned subject set behind on the project.

    :param display_name: display name given to the new subject set
    :param subjects_metadata: non-empty sequence of dicts; the ``'source'``
        key of the first entry selects how the subjects are built
    :raises IndexError: if ``subjects_metadata`` is empty
    :raises NotImplementedError: if the source is not ``EPICOLLECT5_SOURCE``
    """
    project = self._project

    # Validate first: nothing has been created yet if this raises.
    source = subjects_metadata[0]['source']
    if source != self.EPICOLLECT5_SOURCE:
        raise NotImplementedError(f"Unsupported subjects source: {source!r}")

    subject_set = SubjectSet()
    subject_set.display_name = display_name
    subject_set.links.project = project
    subject_set.save()

    self.log.info(
        f"Creating {len(subjects_metadata)} subjects to Subject Set {display_name}"
    )
    subjects = self._create_subjects_from_epicollect5(
        project, subjects_metadata)
    subject_set.add(subjects)

    # Attach the populated set to every workflow linked to the project.
    for workflow in project.links.workflows:
        workflow.add_subject_sets(subject_set)
        self.log.info(
            f"Added new Subject Set '{display_name}' to workflow '{workflow.display_name}'"
        )
def create_subject_set(docref, name):
    """Create, save and return a subject set named ``"<docref> - <name>"``.

    The set is linked to the ``project`` name resolved from the enclosing
    scope (it is not a parameter of this function).
    """
    print("Attempting to create a subject set via the Zooniverse API")
    display_name = docref + " - " + name

    new_set = SubjectSet()
    new_set.links.project = project
    new_set.display_name = display_name
    new_set.save()
    return new_set
def add_new_subject(self, image_list, metadata_list, subject_set_name):
    """Create a subject set and upload one subject per image/metadata pair.

    ``image_list`` and ``metadata_list`` must be of equal length; the i-th
    metadata entry is attached to the i-th image.

    :param image_list: list of image locations to be added
    :param metadata_list: list of metadata dicts to be added
    :param subject_set_name: display name of the subject set to create
    :raises ValueError: if the two lists differ in length (checked before
        anything is created, so no half-filled subject set is left behind)
    :return: None
    """
    if len(image_list) != len(metadata_list):
        raise ValueError("Image list and metadata list do not match")

    # Create the subject set the new subjects will be linked to.
    subject_set = SubjectSet()
    subject_set.links.project = self.project
    subject_set.display_name = subject_set_name
    subject_set.save()

    # Save each subject individually, then link them all in one call.
    new_subjects = []
    for image, metadata in zip(image_list, metadata_list):
        subject = Subject()
        subject.links.project = self.project
        subject.add_location(image)
        subject.metadata.update(metadata)
        subject.save()
        new_subjects.append(subject)

    subject_set.add(new_subjects)
def create_subject_set(project, subject_set_name):
    """Create a subject set on ``project``, register it and return it.

    The set is saved first and then passed to ``project.add_subject_sets``.
    """
    subject_set = SubjectSet()
    subject_set.display_name = subject_set_name
    subject_set.links.project = project
    subject_set.save()

    project.add_subject_sets(subject_set)
    return subject_set
def upload_manifest_to_galaxy_zoo(subject_set_name, manifest, galaxy_zoo_id='5733', n_processes=10):
    """
    Save manifest (set of galaxies with metadata prepared) to Galaxy Zoo

    Args:
        subject_set_name (str): name for subject set. If it contains 'TEST',
            nothing is uploaded.
        manifest (list): containing dicts of form
            {png_loc: img.png, key_data: {metadata_col: metadata_value}}
        galaxy_zoo_id (str): panoptes project id e.g. '5733' for Galaxy Zoo,
            '6490' for mobile
        n_processes (int): number of pool workers with which to upload
            galaxies in parallel

    Returns:
        list: the manifest that was passed in, unchanged (for debugging only)
    """
    if 'TEST' in subject_set_name:
        logging.warning('Testing mode detected - not uploading!')
        return manifest

    if galaxy_zoo_id == '5733':
        logging.info('Uploading to Galaxy Zoo project 5733')
    elif galaxy_zoo_id == '6490':
        logging.info('Uploading to mobile app project 6490')
    else:
        logging.info('Uploading to unknown project {}'.format(galaxy_zoo_id))

    # Important - don't commit the password!
    zooniverse_login = read_data_from_txt(zooniverse_login_loc)
    Panoptes.connect(**zooniverse_login)

    galaxy_zoo = Project.find(galaxy_zoo_id)

    subject_set = SubjectSet()
    subject_set.links.project = galaxy_zoo
    subject_set.display_name = subject_set_name
    subject_set.save()

    pbar = tqdm(total=len(manifest), unit=' subjects uploaded')
    save_subject_partial = functools.partial(
        save_subject, project=galaxy_zoo, pbar=pbar)
    pool = ThreadPool(n_processes)
    try:
        new_subjects = pool.map(save_subject_partial, manifest)
    finally:
        # Release the workers and the progress bar even when an upload fails.
        pool.close()
        pool.join()
        pbar.close()

    subject_set.add(new_subjects)

    return manifest  # for debugging only
def _create_subject_set(self, project_id, subject_set_name):
    """Look up the project by id, then create, save and return a subject set.

    :param project_id: id passed to ``Project.find``
    :param subject_set_name: display name of the new subject set
    :return: the saved subject set
    """
    owner = Project.find(project_id)

    new_set = SubjectSet()
    new_set.links.project = owner
    new_set.display_name = subject_set_name
    new_set.save()
    return new_set
def main(production=False):
    """Create a timestamped test subject set and upload every local subject.

    Files are read from the ``subjects`` directory next to this script and
    are expected to be named ``image_<n>.png``, ``difference_<n>.json``,
    ``model_<n>.json`` and ``metadata_<n>.json``. A missing or unreadable
    metadata file is replaced by an empty dict.

    :param production: currently unused; the project id is fixed below
    """
    import getpass  # local import so this block does not rely on a file-level one

    uname = input('Enter your username: ')
    # NOTE(review): the credential/connect statement was garbled in the
    # source (a redaction left a syntax error). It is reconstructed here with
    # a getpass prompt -- confirm the keyword arguments before relying on it.
    Panoptes.connect(
        username=uname,
        password=getpass.getpass(),
        endpoint='https://panoptes-staging.zooniverse.org',
        admin=True
    )
    pId = 5733  # if production else 1820
    project = Project.find(pId)

    subject_set = SubjectSet()
    subject_set.links.project = project
    subject_set.display_name = 'Test_subject_set_' + str(int(time.time()))
    subject_set.save()

    loc = os.path.abspath(os.path.dirname(__file__))
    subjects = os.listdir(loc + '/subjects')

    def _indices(prefix):
        # Sorted numeric suffixes of every '<prefix>_<n>.json|png' file.
        pattern = re.compile(r'{}_([0-9]+)\.(?:json|png)$'.format(prefix))
        return sorted(
            int(found.group(1))
            for found in map(pattern.match, subjects)
            if found
        )

    # The prefixes are listed in the same order as the names they are
    # unpacked into (the original swapped 'image' and 'difference').
    images, differences, model, metadata_ids = (
        _indices(prefix)
        for prefix in ('image', 'difference', 'model', 'metadata')
    )
    if not images == differences == model == metadata_ids:
        print(
            'Images, differences, model and metadata ' +
            'must all have same length'
        )

    # TODO: change subject directory structure to be more efficient
    #       (not having 12,000+ files in a folder...)
    for i in images:
        try:
            with open('{}/subjects/metadata_{}.json'.format(loc, i)) as f:
                metadata = json.load(f)
        except IOError:
            metadata = {}
        subject_set = uploadSubjectToSet(
            project, subject_set,
            [[j.format(loc, i) for j in (
                '{}/subjects/image_{}.png',
                '{}/subjects/difference_{}.json',
                '{}/subjects/model_{}.json'
            )]],  # locations
            [metadata],
        )
def createSubjectSet(subjName, project):
    """Create a subject set called ``subjName`` on ``project`` and return it.

    The set is linked to the project, named, and saved before being returned.
    """
    new_set = SubjectSet()
    new_set.display_name = subjName
    new_set.links.project = project
    new_set.save()
    return new_set
def create_subject_set(folder_name, set_name='test_subject_set'):
    """Upload every ``image_<name>.png`` subject in a folder as a new subject set.

    Each subject consists of ``image_<name>.png``, ``difference_<name>.json``
    and ``model_<name>.json`` (uploaded as locations) plus
    ``metadata_<name>.json`` (loaded as the subject metadata; an unreadable
    file yields an empty dict). The user is prompted for credentials.

    :param folder_name: directory containing the subject files
    :param set_name: display name of the subject set to create
    :return: the subject set as returned by ``uploadSubjectToSet``
    :raises AssertionError: if any of the four files of a subject is missing
    """
    # Anchored pattern with an escaped dot: only real '*.png' names match,
    # so e.g. 'image_1.png.bak' is no longer picked up as subject '1'.
    image_pattern = re.compile(r'image_(.*)\.png$')
    subject_names = [
        found.group(1)
        for found in map(image_pattern.match, os.listdir(folder_name))
        if found is not None
    ]
    files = [
        [
            join(folder_name, file_name)
            for file_name in (
                'image_{}.png'.format(subject_name),
                'difference_{}.json'.format(subject_name),
                'model_{}.json'.format(subject_name),
                'metadata_{}.json'.format(subject_name),
            )
        ]
        for subject_name in subject_names
    ]

    assert all(os.path.exists(j) for i in files for j in i), 'Missing files!'

    uname = input('Enter your username: ')
    pwd = getpass.getpass()
    Panoptes.connect(
        username=uname,
        password=pwd,
        admin=True
    )
    pId = 5590
    project = Project.find(pId)

    subject_set = SubjectSet()
    subject_set.links.project = project
    subject_set.display_name = set_name
    subject_set.save()

    metadata_list = []
    for fs in files:
        try:
            with open(fs[3]) as metaF:
                metadata = json.load(metaF)
        except IOError:
            metadata = {}
        metadata_list.append(metadata)

    subject_set = uploadSubjectToSet(
        project, subject_set,
        [i[:3] for i in files],  # image, difference, model
        metadata_list,
    )
    # Hand the populated set back to the caller (previously discarded).
    return subject_set
def create(quiet, project_id, display_name):
    """
    Creates a new subject set.

    Prints the subject set ID and name of the new subject set.
    """
    new_set = SubjectSet()
    new_set.display_name = display_name
    new_set.links.project = project_id
    new_set.save()

    # Quiet mode emits only the id; otherwise use the shared echo helper.
    if not quiet:
        echo_subject_set(new_set)
        return
    click.echo(new_set.id)
def upload_images(id, use_database=True):
    """Create a subject set named ``id`` and upload one subject per manifest.

    The function changes into ``target + id``, reads the first line of every
    ``*-manifest.txt`` file there (comma separated: subject id, three
    locations, source name, ra, dec, size), saves a subject for each, links
    them to the new subject set and adds the set to workflow 11973.

    The previous working directory is restored on exit, including when an
    upload fails (the original saved it in ``wd`` but never changed back).

    :param id: field identifier; used as directory name and set display name
    :param use_database: when true, report progress through ``update_status``
    """
    print('Create subject set and upload images for', id)
    if use_database:
        update_status(id, gz_status='Uploading')
    wd = os.getcwd()
    Panoptes.connect(username='******', password=os.environ['PANOPTES_PASSWORD'])
    os.chdir(target + id)
    try:
        project = Project.find(slug='chrismrp/radio-galaxy-zoo-lofar')

        subject_set = SubjectSet()
        subject_set.display_name = id
        subject_set.links.project = project
        subject_set.save()
        print('Made subject set')

        new_subjects = []
        g = glob.glob('*-manifest.txt')
        for i, f in enumerate(g):
            # Only the first line of a manifest is used.
            with open(f) as manifest_file:
                bits = manifest_file.readlines()[0].split(',')
            metadata = {
                'subject_id': int(bits[0]),
                'ra': float(bits[5]),
                'dec': float(bits[6]),
                '#size': float(bits[7]),
                'source_name': bits[4]
            }
            print('Upload doing', bits[4], '%i/%i' % (i, len(g)))
            subject = Subject()
            subject.links.project = project
            subject.metadata.update(metadata)
            for location in bits[1:4]:
                subject.add_location(location)
            subject.save()
            new_subjects.append(subject)

        subject_set.add(new_subjects)

        workflow = Workflow(11973)
        workflow.links.subject_sets.add(subject_set)
    finally:
        os.chdir(wd)
    if use_database:
        update_status(id, gz_status='In progress')
    print('Done!')
def main(production=False):
    """Create a timestamped test subject set and upload up to 20 subjects.

    Subjects ``0..19`` are read from the ``subjects`` directory next to this
    script; the upload stops at the first index whose ``image_<i>.png`` is
    absent. A missing or unreadable metadata file yields an empty dict.

    :param production: selects project 5590 when true, 1820 otherwise
    """
    import getpass  # local import so this block does not rely on a file-level one

    uname = input('Enter your username: ')
    # NOTE(review): the credential/connect statement was garbled in the
    # source (a redaction left a syntax error). It is reconstructed here with
    # a getpass prompt -- confirm the keyword arguments before relying on it.
    Panoptes.connect(
        username=uname,
        password=getpass.getpass(),
        endpoint='https://panoptes-staging.zooniverse.org',
        admin=True
    )

    pId = 5590 if production else 1820
    project = Project.find(pId)

    subject_set = SubjectSet()
    subject_set.links.project = project
    subject_set.display_name = 'Test_subject_set_' + str(int(time.time()))
    subject_set.save()

    loc = os.path.abspath(os.path.dirname(__file__))
    subjects = os.listdir(loc + '/subjects')

    # TODO: change subject directory structure to be more efficient
    #       (not having 12,000+ files in a folder...)
    for i in range(20):
        # Indices are expected to be contiguous: stop at the first gap.
        if 'image_{}.png'.format(i) not in subjects:
            break
        try:
            with open('{}/subjects/metadata_{}.json'.format(loc, i)) as f:
                metadata = json.load(f)
        except IOError:
            metadata = {}
        subject_set = uploadSubjectToSet(
            project,
            subject_set,
            [[
                j.format(loc, i)
                for j in ('{}/subjects/image_{}.png',
                          '{}/subjects/difference_{}.json',
                          '{}/subjects/model_{}.json')
            ]],  # locations
            [metadata],
        )
def create_subjects_and_link_to_project(self, proto_subjects, project_id,
                                        workflow_id, subject_set_id):
    """Upload subjects, link them to a subject set and add it to a workflow.

    Credentials are read from the ``PANOPTES_USERNAME`` and
    ``PANOPTES_PASSWORD`` environment variables. Any exception raised along
    the way is logged through ``self.log.exception`` and not re-raised.

    :param proto_subjects: iterable of dicts with the keys ``'location_lc'``,
        ``'location_ps'`` and ``'metadata'``
    :param project_id: id of the project the subjects belong to
    :param workflow_id: id of the workflow the subject set is added to
    :param subject_set_id: id of an existing subject set, or ``None`` to
        create a new one named after the current UTC time
    :return: None
    """
    try:
        USERNAME = os.getenv('PANOPTES_USERNAME')
        PASSWORD = os.getenv('PANOPTES_PASSWORD')
        Panoptes.connect(username=USERNAME,
                         password=PASSWORD,
                         endpoint=self.ENDPOINT)

        project = Project.find(project_id)
        # `find` is looked up on the class; no throwaway instance is needed.
        workflow = Workflow.find(workflow_id)

        if subject_set_id is None:
            subject_set = SubjectSet()
            subject_set.display_name = time.strftime(
                "%m-%d-%Y %H:%M:%S", time.gmtime())
            subject_set.links.project = project
            subject_set.save()
        else:
            subject_set = SubjectSet.find(subject_set_id)

        subjects = []
        for proto_subject in proto_subjects:
            subject = Subject()
            subject.links.project = project
            subject.add_location(proto_subject['location_lc'])
            subject.add_location(proto_subject['location_ps'])
            subject.metadata.update(proto_subject['metadata'])
            subject.save()
            subjects.append(subject)

        subject_set.add(subjects)
        workflow.add_subject_sets(subject_set)
    except Exception:
        self.log.exception("Error in create_subjects_and_link_to_project ")
def upload_chunks(self, destination, project_slug, set_prefix,
                  zooniverse_login, zooniverse_pwd, batches=0, **kwargs):
    """Upload not-yet-uploaded chunk batches, one subject set per batch.

    ``<destination>/chunks.csv`` is read, grouped by its ``batch`` column,
    and every batch that is not fully uploaded gets its own subject set named
    ``"<set_prefix>_batch_<batch>"``. After each batch the CSV is rewritten
    with the upload details of its chunks.

    :param destination: directory holding ``chunks.csv`` and ``chunks/``
    :param project_slug: slug of the target project
    :param set_prefix: prefix of the subject set display names
    :param zooniverse_login: login used to connect
    :param zooniverse_pwd: password used to connect
    :param batches: stop after this many batches; 0 uploads all of them
    :raises Exception: if the chunk metadata cannot be read (the underlying
        error is chained as ``__cause__``)
    """
    self.destination = destination

    metadata_location = os.path.join(self.destination, 'chunks.csv')
    try:
        self.chunks = pd.read_csv(metadata_location, index_col='index')
    except Exception as err:
        # Not a bare except: Ctrl-C must still interrupt, and the real
        # cause is kept for debugging.
        raise Exception(
            "cannot read chunk metadata in {}. Check the --destination parameter, and make sure you have extracted chunks before."
            .format(metadata_location)) from err

    Panoptes.connect(username=zooniverse_login, password=zooniverse_pwd)
    zooniverse_project = Project.find(slug=project_slug)

    uploaded = 0
    for batch, chunks in self.chunks.groupby('batch'):
        if chunks['uploaded'].all():
            continue

        subjects_metadata = []

        subject_set = SubjectSet()
        subject_set.links.project = zooniverse_project
        subject_set.display_name = "{}_batch_{}".format(set_prefix, batch)
        subject_set.save()

        subjects = []
        for chunk_index, chunk in chunks.to_dict(orient='index').items():
            print("uploading chunk {} ({},{}) in batch {}".format(
                chunk['recording'], chunk['onset'], chunk['offset'], batch))

            subject = Subject()
            subject.links.project = zooniverse_project
            subject.add_location(
                os.path.join(self.destination, 'chunks', chunk['mp3']))
            subject.metadata['date_extracted'] = chunk['date_extracted']
            subject.save()
            subjects.append(subject)

            # Record where the chunk went so the CSV can be updated below.
            chunk['index'] = chunk_index
            chunk['zooniverse_id'] = subject.id
            chunk['project_slug'] = project_slug
            chunk['subject_set'] = str(subject_set.display_name)
            chunk['uploaded'] = True
            subjects_metadata.append(chunk)

        subject_set.add(subjects)

        self.chunks.update(
            pd.DataFrame(subjects_metadata).set_index('index'))

        self.chunks.to_csv(os.path.join(self.destination, 'chunks.csv'))
        uploaded += 1

        if batches > 0 and uploaded >= batches:
            return
def main():
    """Bin the images listed in a file into subject sets of at most n images.

    For every group a subject set is created; each image id is looked up in
    the MongoDB collection to obtain its ``file.anonPath`` location, uploaded
    as a subject, added to the set, and its record is marked as sent. All
    created sets are finally added to the project's first workflow.

    :raises ValueError: if ``--size`` is not between 1 and 10000 inclusive
    :raises LookupError: if a listed image id has no record in the collection
    """
    ap = argparse.ArgumentParser(
        description=
        'Given a list of images, bins them into subject sets of size n')

    # require file path to read in images
    ap.add_argument('-f',
                    '--filename',
                    required=True,
                    dest='filename',
                    type=str,
                    help='The name of the file from which to read the images')

    # optionally require subject set size; defaults to 1000
    ap.add_argument(
        '-n',
        '--size',
        required=False,
        dest='n',
        type=int,
        default=1000,
        help='The maximum number of images a subject set should contain. '
        'The value should be between 1 and 10000, inclusive')

    # parse args into variables and check values
    args = vars(ap.parse_args())

    filename = args['filename']
    n = args['n']

    # `-n 0` used to be turned into None and crash with a TypeError here.
    if n is None or not 1 <= n <= 10000:
        raise ValueError('n must be between 1 and 10000, inclusive')

    # connect to zooniverse
    Panoptes.connect(username=zooniverse_config.Zooniverse_USERNAME,
                     password=zooniverse_config.Zooniverse_PASS)
    project = Project.find(zooniverse_config.Project_ID)

    # connection to mongodb
    mongoConn = MongoClient(csh_db_config.DB_HOST + ":" +
                            str(csh_db_config.DB_PORT))
    cshTransDB = mongoConn[csh_db_config.TRANSCRIPTION_DB_NAME]
    cshTransDB.authenticate(csh_db_config.TRANSCRIPTION_DB_USER,
                            csh_db_config.TRANSCRIPTION_DB_PASS)
    cshCollection = cshTransDB[csh_db_config.TRANS_DB_MeetingMinColl]

    # track subject sets being created
    subjectSets = []

    # get the image ids, dropping trailing whitespace and blank lines
    with open(filename) as handle:
        filenames = [line.rstrip() for line in handle if line.strip()]

    # divide files into groups of n
    filegroups = [filenames[i:i + n] for i in range(0, len(filenames), n)]

    for group in filegroups:

        displayName = '{:%Y-%b-%d %H:%M:%S}'.format(datetime.datetime.now())

        # create a new subject set
        subjectSet = SubjectSet()
        subjectSet.links.project = project
        subjectSet.display_name = displayName
        subjectSet.save()

        subjectSetId = subjectSet.id
        subjectSets.append(subjectSetId)

        # create a new subject for each file and add to the subject set
        for image_id in group:

            record = cshCollection.find_one({'_id': image_id})
            if record is None:
                raise LookupError(
                    'no database record found for image {!r}'.format(image_id))

            # create a new subject
            subject = Subject()
            subject.links.project = project
            subject.add_location(record['file']['anonPath'])
            subject.metadata['ID'] = image_id
            subject.save()

            # add to subject set
            subjectSet.add(subject)

            # mark the record as sent for crowdsourcing
            updateQuery = {
                '$set': {
                    'canCrowdsource': True,
                    'transcription': {
                        'numClassifications': 5,
                        'subjectSetId': subjectSetId,
                        'status': 'sent'
                    }
                }
            }
            cshCollection.find_one_and_update({'_id': image_id}, updateQuery)

    # add subject sets to the workflow
    workflow = project.links.workflows[0]
    workflow.add_subject_sets(subjectSets)

    # print helpful information to the console
    print('{} subject sets created with the following IDs: {}'.format(
        len(subjectSets), subjectSets))
images = [a['src'] for a in soup.find_all("img", {"src": re.compile("gstatic.com")})] #print images for img in images: raw_img = urllib2.urlopen(img).read() #add the directory for your image here DIR="images/" cntr = len([i for i in os.listdir(DIR) if image_type in i]) + 1 f = open(DIR + image_type + "_"+ str(cntr)+".jpg", 'wb') f.write(raw_img) f.close() print 'Creating image set...' # create the subject set. subject_set = SubjectSet() subject_set.links.project = p subject_set.display_name = "Images of " + thing + '\'s' subject_set.save() print 'Uploading images to Zooniverse...' # add all images to subject set for i in range(1,21): subject = Subject() subject.links.project = p subject.add_location('images/' + str(thing) + '_' + str(i)+'.jpg') subject.save() subject_set.add(subject) print 'Complete.'
retry = input('Enter "y" to try again, any other key to exit' + '\n') if retry.lower() != 'y': quit() # get new subject name new_set_name = input('Enter a name for the subject set to use or create:' + '\n') # find or build destination subject set try: # check if the subject set already exits subject_set_new = SubjectSet.where(project_id=proj.id, display_name=new_set_name).next() except: # create a new subject set for the new data and link it to the project above subject_set_new = SubjectSet() subject_set_new.links.project = proj subject_set_new.display_name = new_set_name subject_set_new.save() # iterate through the subjects linking them and verifying they link. k = 0 for sub in add_subjects: try: subject_set_new.add(sub) print(sub, 'linked to new set') k += 1 except: print(sub, 'previously linked or did not link correctly') print(k, ' subjects linked to subject set ', new_set_name, ' in project ', proj_id) linked = 0 with open(os.getcwd() + os.sep + 'copied_subjects.csv', 'wt') as file:
m += 1 # catch and process the last aggregated group subjects_to_add = process_aggregation(subject, m, workflow_id, workflow_version, bin_1, subjects_to_add) if step_to_analyse == 'Q4': proj = Project.find(slug='tedcheese/whales-as-individuals') try: # check if the subject set already exits subject_set = SubjectSet.where(project_id=proj.id, display_name=set_name).next() print("Add subjects to subject set: {}.".format(subject_set.display_name)) except: # create a new subject set for the new data and link it to the project above subject_set = SubjectSet() subject_set.links.project = proj subject_set.display_name = set_name subject_set.save() print("Created a new subject set with id: {}.".format(subject_set.id)) linked_subjects = set() # use sets to automatically do inclusion test with open(subject_location) as sub_file: r = csv.DictReader(sub_file) for sub_row in r: if sub_row['subject_set_id'] == subject_set.id: linked_subjects |= {sub_row['subject_id']} add_subjects = (subjects_to_add - linked_subjects) print("Adding {} subjects to the subject set".format(len(add_subjects))) k = 0 # iterate through the subjects to advance verifying they load (for now) may use a list later.
'Title': item_title } segments.append(segment) print('Item segments transformation complete.') return segments segments = transform_item_segments('https://www.loc.gov/item/' + LIBRARY_OF_CONGRESS_ITEM_ID) Panoptes.connect(username=USERNAME, password=PASSWORD, endpoint=ENDPOINT) project = Project.find(PROJECT) subject_set = SubjectSet() subject_set.links.project = project subject_set.display_name = segments[0]['metadata']['Title'] # uses item Title as default subject set name, or feel free to hardcode subject_set.save() print('Begin Zooniverse subject upload...') for segment in segments: subject = Subject() subject.links.project = project subject.add_location(segment['location']) subject.metadata.update(segment['metadata']) subject.save() subject_set.add(subject) print("Zooniverse subject upload complete.")
# The "Sky Sounds" project is associated with the identifier 13586.
project = Project('13586')

# ------- Image subject set -------

# Reuse the subject set with the requested display name, or create a new
# one when the lookup yields nothing.
try:
    # Check whether the subject set already exists.
    matching_sets = SubjectSet.where(project_id=project.id,
                                     display_name=image_set_name)
    subject_set = matching_sets.next()
except StopIteration:
    # Create a new subject set for the new data and link it to the project.
    subject_set = SubjectSet()
    subject_set.display_name = image_set_name
    subject_set.links.project = project
    subject_set.save()

# Add the samples to the subject set: one subject per manifest row, with the
# 'lc' and 'sp' columns as locations and the 'id' column as metadata.
with open(manifest_images_file, 'r') as mani_file:
    print('Uploading image_set')
    for row in csv.DictReader(mani_file):
        new_subject = Subject()
        new_subject.links.project = project
        for column in ('lc', 'sp'):
            new_subject.add_location(row[column])
        new_subject.metadata['subject_id'] = row['id']
        new_subject.save()
        subject_set.add(new_subject.id)
def create(project_id, display_name):
    # Build the subject set, link it to the given project id, save it and
    # print it through the shared echo helper.
    new_set = SubjectSet()
    new_set.display_name = display_name
    new_set.links.project = project_id
    new_set.save()

    echo_subject_set(new_set)
def upload_chunks(self,
                  chunks: str,
                  project_id: int,
                  set_name: str,
                  zooniverse_login="",
                  zooniverse_pwd="",
                  amount: int = 1000,
                  ignore_errors: bool = False,
                  **kwargs):
    """Uploads ``amount`` audio chunks from the CSV dataframe `chunks` to a zooniverse project.

    :param chunks: path to the chunk CSV dataframe
    :type chunks: str
    :param project_id: zooniverse project id
    :type project_id: int
    :param set_name: name of the subject set
    :type set_name: str
    :param zooniverse_login: zooniverse login, handed to ``self.get_credentials``, defaults to ''
    :type zooniverse_login: str, optional
    :param zooniverse_pwd: zooniverse password, handed to ``self.get_credentials``, defaults to ''
    :type zooniverse_pwd: str, optional
    :param amount: amount of chunks to upload, defaults to 1000
    :type amount: int, optional
    :param ignore_errors: when True, a chunk that fails to save is skipped and the upload carries on; otherwise the upload stops at the first failure (chunks already saved are still linked and recorded), defaults to False
    :type ignore_errors: bool, optional
    :raises Exception: if the chunk metadata cannot be read (the underlying error is chained as ``__cause__``)
    """

    self.chunks_file = chunks
    self.get_credentials(zooniverse_login, zooniverse_pwd)

    metadata_location = os.path.join(self.chunks_file)
    try:
        self.chunks = pd.read_csv(metadata_location, index_col="index")
    except Exception as err:
        # Not a bare except: Ctrl-C must still interrupt, and the real
        # cause is kept for debugging.
        raise Exception("cannot read chunk metadata from {}.".format(
            metadata_location)) from err

    assert_dataframe("chunks", self.chunks)
    assert_columns_presence(
        "chunks",
        self.chunks,
        {"recording_filename", "onset", "offset", "uploaded", "mp3"},
    )

    # Select the work first so that no connection is opened and no empty
    # subject set is created when there is nothing to do.
    chunks_to_upload = self.chunks[self.chunks["uploaded"] == False].head(
        amount)
    chunks_to_upload = chunks_to_upload.to_dict(orient="index")

    if len(chunks_to_upload) == 0:
        print("nothing left to upload.")
        return

    from panoptes_client import Panoptes, Project, Subject, SubjectSet

    Panoptes.connect(username=self.zooniverse_login,
                     password=self.zooniverse_pwd)
    zooniverse_project = Project(project_id)

    # Reuse the project's subject set with this display name if there is one.
    subject_set = None
    for ss in zooniverse_project.links.subject_sets:
        if ss.display_name == set_name:
            subject_set = ss

    if subject_set is None:
        subject_set = SubjectSet()
        subject_set.links.project = zooniverse_project
        subject_set.display_name = set_name
        subject_set.save()

    subjects = []
    subjects_metadata = []

    for chunk_index, chunk in chunks_to_upload.items():
        print("uploading chunk {} ({},{})".format(
            chunk["recording_filename"], chunk["onset"], chunk["offset"]))

        subject = Subject()
        subject.links.project = zooniverse_project
        subject.add_location(
            os.path.join(os.path.dirname(self.chunks_file), "chunks",
                         chunk["mp3"]))
        subject.metadata["date_extracted"] = chunk["date_extracted"]

        try:
            subject.save()
        except Exception as e:
            print("failed to save chunk {}. an exception has occured:\n{}".
                  format(chunk_index, str(e)))
            print(traceback.format_exc())

            # Was `args.ignore_errors`: `args` is not defined in this
            # function, so the first failed save raised a NameError.
            if ignore_errors:
                continue
            print("subject upload halting here.")
            break

        subjects.append(subject)

        # Record where the chunk went so the CSV can be updated below.
        chunk["index"] = chunk_index
        chunk["zooniverse_id"] = str(subject.id)
        chunk["project_id"] = str(project_id)
        chunk["subject_set"] = str(subject_set.display_name)
        chunk["uploaded"] = True
        subjects_metadata.append(chunk)

    if len(subjects) == 0:
        return

    subject_set.add(subjects)

    self.chunks.update(pd.DataFrame(subjects_metadata).set_index("index"))

    self.chunks.to_csv(self.chunks_file)
try: Panoptes.connect(username=zcfg.login['user'], password=zcfg.login['pass']) project = Project.find("6307") except Exception as e: f = open(logfile, "a") t = time.localtime() f.write('Unable to connect to Zooniverse: '+time.strftime("%D:%H:%M:%S",t)+'\n') f.close() subject_set = SubjectSet() s = Subject() subject_set.links.project = project subject_set.display_name = 'Tutorial subject set 2' images = glob.glob(path) new_subjects = [] for img in images: try: s = Subject() s.links.project = project # manifest file if os.path.splitext(img)[1] == ".csv": # upload manifest info.... not sure how this will be set up after second step # move csv to complete images folder shutil.copy(f, completed_images) # make dict out of csv file for upload manifest = csv.DictReader(open(img)) s.metadata.update(manifest)
def create_subjects_and_link_to_project(proto_subjects,
                                        project_id,
                                        subject_set_id,
                                        subject_set_name=None):
    '''
    Find the project and the relevant subject set, compare the metadata of
    the subjects already in the set with the new proto_subjects, and upload
    only the subjects whose metadata is not in the set yet.

    Keyword Arguments:
    proto_subjects -- dictionary mapping subject filepath+filename to the
                      associated metadata
    project_id -- identifier to find and link with the project
    subject_set_id -- identifier for the subject set of interest, or None to
                      create a new subject set
    subject_set_name -- name for a new subject set (a dated, randomised name
                        is generated when omitted); when subject_set_id is
                        given it must match the name of that existing set

    Returns -1 when subject_set_name does not match the name of the existing
    subject set, None otherwise.
    '''

    # get the project object
    project = Project.find(project_id)

    # set up subject_set
    if subject_set_id is None:
        subject_set = SubjectSet()  # create empty subject_set
        subject_set.links.project = project

        if subject_set_name is None:
            # if not defined generate a random subject set name to avoid
            # error when a set already exists
            subject_set_name = 'subject_set_{:02d}_{:02d}_{:04d}_{}'.format(
                date.day, date.month, date.year,
                ''.join(generate_random_str()))
        print("will create a subject set called: {}".format(subject_set_name))
        subject_set.display_name = subject_set_name  # set the name of the subject set
        subject_set.save()
        project.reload()
    else:
        # find the existing subject_set (no throwaway instance needed)
        subject_set = SubjectSet.find(subject_set_id)
        existing_subject_set_name = subject_set.display_name  # get its name

        # if you have tried to set the subject set name, check that it
        # matches the name for the chosen subject set id
        if (subject_set_name is not None
                and existing_subject_set_name != subject_set_name):
            print(
                "your chosen subject set name does not match the existing name: {}, {}"
                .format(subject_set_name, existing_subject_set_name))
            return -1

        subject_set_name = existing_subject_set_name
        print("add to existing subject set: {}".format(subject_set_name))

    # Create a list of the existing subject metadata
    meta_list = []
    print("existing subjects:")
    for subject in subject_set.subjects:
        print(subject.id, subject.metadata)
        meta_list.append(subject.metadata)

    # When making list of subjects to add, check to see if the metadata of
    # the subject you want to add is already in the set
    print("new subjects:")
    new_subjects = []
    for filename, metadata in proto_subjects.items():

        # Plain membership compares the metadata by equality; no array
        # conversion of dicts is involved.
        if metadata in meta_list:
            print("{}, subject already in set".format(metadata))
            # In this case we skip over the subject that already exists.
            # N.B. you may want to remove an existing subject and update it
            # with the new one
            continue

        # Otherwise we can add the subject to the new subject list
        subject = Subject()
        subject.links.project = project
        subject.add_location(filename)
        subject.metadata.update(metadata)
        subject.save()
        new_subjects.append(subject)
        print("{}, new subject add to list".format(metadata))

    print("new subjects to add: {}".format(new_subjects))

    # add the new subject list (data and metadata) to the already defined
    # project subject set
    subject_set.add(new_subjects)

    return
def upload_manifest_to_galaxy_zoo(
        subject_set_name,
        manifest,
        project_id='5733',  # default to main GZ project
        login_loc='zooniverse_login.txt'):
    """
    Save manifest (set of galaxies with metadata prepared) to Galaxy Zoo

    An existing subject set of the project with the same display name is
    reused; otherwise a new one is created. Subjects are saved and linked in
    blocks of 100.

    Args:
        subject_set_name (str): name for subject set. If it contains 'TEST',
            nothing is uploaded.
        manifest (list): containing dicts of form
            {png_loc: img.png, key_data: {metadata_col: metadata_value}}
        project_id (str): panoptes project id e.g. '5733' for Galaxy Zoo,
            '6490' for mobile
        login_loc (str): path of the text file holding the zooniverse login;
            must exist

    Returns:
        list: the manifest that was passed in, unchanged (for debugging only)
    """
    assert os.path.exists(login_loc)

    if 'TEST' in subject_set_name:
        logging.warning('Testing mode detected - not uploading!')
        return manifest

    if project_id == '5733':
        logging.info('Uploading to Galaxy Zoo project 5733')
    elif project_id == '6490':
        logging.info('Uploading to mobile app project 6490')
    elif project_id == '8751':
        logging.info('Uploading to staging project 8751')
    else:
        logging.info('Uploading to unknown project {}'.format(project_id))

    # Important - don't commit the password!
    zooniverse_login = read_data_from_txt(login_loc)
    Panoptes.connect(**zooniverse_login)

    project = Project.find(project_id)

    # check if subject set already exists
    subject_set = None
    subject_sets = SubjectSet.where(project_id=project_id)
    for candidate_subject_set in subject_sets:
        if candidate_subject_set.raw['display_name'] == subject_set_name:
            # use if it already exists
            subject_set = candidate_subject_set
    if not subject_set:  # make a new one if not
        subject_set = SubjectSet()
        subject_set.links.project = project
        subject_set.display_name = subject_set_name
        subject_set.save()

    pbar = tqdm(total=len(manifest), unit=' subjects uploaded')

    save_subject_partial = functools.partial(
        save_subject, project=project, pbar=pbar)

    # upload in async blocks, to avoid huge join at end
    manifest_block_size = 100
    # range() stops exactly at the end of the manifest; the previous
    # `start > len(manifest)` test linked one extra, empty block whenever the
    # length was a multiple of the block size (including an empty manifest).
    for manifest_block_start in range(0, len(manifest), manifest_block_size):
        manifest_block = manifest[manifest_block_start:manifest_block_start +
                                  manifest_block_size]
        new_subjects = []
        with Subject.async_saves():
            for manifest_entry in manifest_block:
                new_subjects.append(save_subject_partial(manifest_entry))

        subject_set.add(new_subjects)
        # Log the count, not the whole list of subject objects.
        logging.info('{} subjects linked'.format(len(new_subjects)))

    return manifest  # for debugging only
def create_subject_set(project_id: int, name: str):
    """Create, save and return a subject set linked to ``Project(project_id)``.

    :param project_id: id used to build the ``Project`` the set is linked to
    :param name: display name of the new subject set
    :return: the saved subject set
    """
    owner = Project(project_id)

    new_set = SubjectSet()
    new_set.links.project = owner
    new_set.display_name = name
    new_set.save()
    return new_set