# Collect the source URL of every <img> tag whose src matches gstatic.com.
images = [a['src'] for a in soup.find_all("img", {"src": re.compile("gstatic.com")})]
# print images

# add the directory for your image here
DIR = "images/"

for img in images:
    # Close the HTTP response even when reading the body fails.
    response = urllib2.urlopen(img)
    try:
        raw_img = response.read()
    finally:
        response.close()

    # Number the new file one past the count of files already in DIR whose
    # name contains image_type.
    cntr = len([i for i in os.listdir(DIR) if image_type in i]) + 1

    # The context manager closes the file handle even if the write raises.
    with open(DIR + image_type + "_" + str(cntr) + ".jpg", 'wb') as f:
        f.write(raw_img)

# Single-argument print(...) produces the same output on Python 2 and 3.
print('Creating image set...')

# create the subject set.
subject_set = SubjectSet()
subject_set.links.project = p
subject_set.display_name = "Images of " + thing + '\'s'
subject_set.save()

print('Uploading images to Zooniverse...')

# add all images to subject set
# NOTE(review): the range is hard-coded to files 1..20 and the path is built
# from `thing`, while the download loop above names files after `image_type`
# - confirm both refer to the same value and that 20 files always exist.
for i in range(1, 21):
    subject = Subject()
    subject.links.project = p
    subject.add_location('images/' + str(thing) + '_' + str(i) + '.jpg')
    subject.save()
    subject_set.add(subject)

print('Complete.')
def upload_manifest_to_galaxy_zoo(
        subject_set_name,
        manifest,
        project_id='5733',  # default to main GZ project
        login_loc='zooniverse_login.txt'):
    """
    Save manifest (set of galaxies with metadata prepared) to Galaxy Zoo.

    If a subject set named ``subject_set_name`` already exists in the project
    it is reused, otherwise a new one is created. Subjects are saved and
    linked in blocks of 100 to avoid one huge join at the end.

    Args:
        subject_set_name (str): name for subject set. If it contains 'TEST',
            nothing is uploaded and the manifest is returned immediately.
        manifest (list): containing dicts of form
            {png_loc: img.png, key_data: {metadata_col: metadata_value}}
        project_id (str): panoptes project id e.g. '5733' for Galaxy Zoo,
            '6490' for mobile
        login_loc (str): path to the text file holding the Zooniverse login
            details, read with ``read_data_from_txt``

    Returns:
        list: the manifest that was passed in (for debugging only)

    Raises:
        AssertionError: if ``login_loc`` does not exist
    """
    assert os.path.exists(login_loc), \
        'Zooniverse login file not found: {}'.format(login_loc)

    if 'TEST' in subject_set_name:
        logging.warning('Testing mode detected - not uploading!')
        return manifest

    known_projects = {
        '5733': 'Uploading to Galaxy Zoo project 5733',
        '6490': 'Uploading to mobile app project 6490',
        '8751': 'Uploading to staging project 8751',
    }
    logging.info(known_projects.get(
        project_id, 'Uploading to unknown project {}'.format(project_id)))

    # Important - don't commit the password!
    zooniverse_login = read_data_from_txt(login_loc)
    Panoptes.connect(**zooniverse_login)

    project = Project.find(project_id)

    # check if subject set already exists
    subject_set = None
    for candidate_subject_set in SubjectSet.where(project_id=project_id):
        if candidate_subject_set.raw['display_name'] == subject_set_name:
            # use if it already exists (no break: the last match wins)
            subject_set = candidate_subject_set
    if not subject_set:  # make a new one if not
        subject_set = SubjectSet()
        subject_set.links.project = project
        subject_set.display_name = subject_set_name
        subject_set.save()

    manifest_block_size = 100
    # The context manager closes the progress bar even if an upload fails.
    with tqdm(total=len(manifest), unit=' subjects uploaded') as pbar:
        save_subject_partial = functools.partial(
            save_subject, project=project, pbar=pbar)

        # upload in async blocks, to avoid huge join at end.
        # range() yields exactly the non-empty blocks, so no trailing empty
        # block is sent when len(manifest) is a multiple of the block size,
        # and nothing is sent at all for an empty manifest.
        for manifest_block_start in range(0, len(manifest), manifest_block_size):
            manifest_block = manifest[
                manifest_block_start:manifest_block_start + manifest_block_size]

            # Leaving the async_saves context waits for the queued saves, so
            # the subjects are only linked once the block has been saved.
            with Subject.async_saves():
                new_subjects = [
                    save_subject_partial(manifest_entry)
                    for manifest_entry in manifest_block
                ]
            subject_set.add(new_subjects)
            # Log the count, not the repr of every subject in the block.
            logging.info('{} subjects linked'.format(len(new_subjects)))

    return manifest  # for debugging only
# Create the subject set that the new subjects will be linked to.
subject_set = SubjectSet()
subject_set.links.project = project
subject_set.display_name = set_name
subject_set.save()

# Single-argument print(...) produces the same output on Python 2 and 3.
print('Uploading subjects, this could take a while!')
new_subjects = 0
for filename, metadata in subject_metadata.items():
    # Files listed in previous_subjects are skipped.
    if filename in previous_subjects:
        continue
    try:
        subject = Subject()
        subject.links.project = project
        subject.add_location(location + os.sep + filename)
        subject.metadata.update(metadata)
        subject.save()
        print(filename)
        subject_set.add(subject.id)
        new_subjects += 1
    except panoptes_client.panoptes.PanoptesAPIException:
        # Best effort: report the failed file and carry on with the rest.
        print('An error occurred during the upload of  {}'.format(filename))
print('{} new subjects created and uploaded'.format(new_subjects))

# Re-fetch the subject set from the API and dump its contents to a CSV file
# next to the images: one "<subject id>,<first metadata value>" row each.
uploaded = 0
with open(location + os.sep + 'Uploaded subjects.csv', 'wt') as file_up:
    # next(...) works on both Python 2 and 3, unlike the .next() method.
    subject_set = next(
        SubjectSet.where(project_id=project.id, display_name=set_name))
    for subject in subject_set.subjects:
        uploaded += 1
        # values() is not indexable on Python 3; take the first value via an
        # iterator and fall back to '' when the subject has no metadata.
        first_value = next(iter(subject.metadata.values()), '')
        # format() also copes with non-string metadata values.
        file_up.write('{},{}\n'.format(subject.id, first_value))
print('{}  subjects found in the subject set, see the full list in Uploaded subjects.csv.'.format(uploaded))
def main():
    """Bin the images listed in a file into Zooniverse subject sets.

    Reads one image ID per line from the file given with ``-f``, splits the
    IDs into groups of at most ``-n`` (default 1000), and for each group
    creates a subject set named after the current timestamp. Each image
    becomes a subject located at the ``file.anonPath`` stored in its MongoDB
    record, and the record is then marked as sent for crowdsourcing. Finally
    all new subject sets are added to the project's first workflow.

    Raises:
        ValueError: if ``-n`` is not between 1 and 10000, inclusive.
    """
    ap = argparse.ArgumentParser(
        description='Given a list of images, bins them into subject sets of size n')

    # require file path to read in images
    ap.add_argument(
        '-f', '--filename',
        required=True,
        dest='filename',
        type=str,
        help='The name of the file from which to read the images')

    # optionally require subject set size; defaults to 1000
    ap.add_argument(
        '-n', '--size',
        required=False,
        dest='n',
        type=int,
        default=1000,
        help='The maximum number of images a subject set should contain. '
             'The value should be between 1 and 10000, inclusive')

    # parse args into variables and check values
    args = ap.parse_args()
    filename = args.filename
    # Use the parsed integer directly: coercing a falsy value to None made
    # "-n 0" fail the comparison below with a TypeError instead of the
    # intended ValueError.
    n = args.n
    if not 1 <= n <= 10000:
        raise ValueError('n must be between 1 and 10000, inclusive')

    # connect to zooniverse
    Panoptes.connect(username=zooniverse_config.Zooniverse_USERNAME,
                     password=zooniverse_config.Zooniverse_PASS)
    project = Project.find(zooniverse_config.Project_ID)

    # connection to mongodb
    mongoConn = MongoClient(csh_db_config.DB_HOST + ":" + str(csh_db_config.DB_PORT))
    cshTransDB = mongoConn[csh_db_config.TRANSCRIPTION_DB_NAME]
    cshTransDB.authenticate(csh_db_config.TRANSCRIPTION_DB_USER,
                            csh_db_config.TRANSCRIPTION_DB_PASS)
    cshCollection = cshTransDB[csh_db_config.TRANS_DB_MeetingMinColl]

    # track subject sets being created
    subjectSets = []

    # get the image IDs in a Python list: drop the trailing newline and skip
    # blank lines, which would otherwise be looked up as an empty ID
    with open(filename) as handle:
        imageIds = [line.rstrip() for line in handle]
    imageIds = [imageId for imageId in imageIds if imageId]

    # divide files into groups of at most n
    filegroups = [imageIds[start:start + n] for start in range(0, len(imageIds), n)]

    for group in filegroups:
        # NOTE(review): the name has one-second resolution, so two groups
        # processed within the same second get the same display name.
        displayName = '{:%Y-%b-%d %H:%M:%S}'.format(datetime.datetime.now())

        # create a new subject set
        subjectSet = SubjectSet()
        subjectSet.links.project = project
        subjectSet.display_name = displayName
        subjectSet.save()

        subjectSetId = subjectSet.id
        subjectSets.append(subjectSetId)

        # create a new subject for each file and add to the subject set
        for imageId in group:
            # create a new subject
            subject = Subject()
            subject.links.project = project

            filepath = cshCollection.find_one({'_id': imageId})['file']['anonPath']
            subject.add_location(filepath)
            subject.metadata['ID'] = imageId
            subject.save()

            # add to subject set
            subjectSet.add(subject)

            # update the record in mongodb to mark it as sent
            updateQuery = {
                '$set': {
                    'canCrowdsource': True,
                    'transcription': {
                        'numClassifications': 5,
                        'subjectSetId': subjectSetId,
                        'status': 'sent'
                    }
                }
            }
            cshCollection.find_one_and_update({'_id': imageId}, updateQuery)

    # add subject sets to the workflow
    workflow = project.links.workflows[0]
    workflow.add_subject_sets(subjectSets)

    # print helpful information to the console
    print('{} subject sets created with the following IDs: {}'.format(
        len(subjectSets), subjectSets))