def load_train_and_test(self, train, test):
    if isinstance(train, pd.DataFrame) and isinstance(test, pd.DataFrame):
        return train, test
    # train/test are expected to be path-like objects exposing .basename();
    # plain string paths would need os.path.basename() instead.
    trainextension, testextension = get_extension(train.basename()), get_extension(test.basename())
    trainloadf, testloadf = getattr(pd, 'read_' + trainextension), getattr(pd, 'read_' + testextension)
    train: pd.DataFrame = trainloadf(train)
    test: pd.DataFrame = testloadf(test)
    return train, test
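# Note: every snippet in this collection assumes a get_extension() helper from a
# local utils module. Its exact behavior varies between projects (some return the
# extension without a leading dot, some accept with_dot=True, some keep the dot).
# A minimal sketch of such a helper, for reference only and not taken from any of
# the original sources, could look like this:
import os

def get_extension(filename, with_dot=False):
    # os.path.splitext returns ('name', '.ext'); strip the dot unless requested
    ext = os.path.splitext(filename)[1]
    return ext if with_dot else ext.lstrip('.')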
def process_methods(self, diff_only=False, changes=None):
    """
    Entry point for documenting the methods in every file that needs it.

    Parameters
    ----------
    bool diff_only: Use a diff only. Consumed by dyc diff.
    list changes: Changes in a file, mainly used with dyc diff.
    """
    changes = changes or []  # avoid a mutable default argument
    print('\nProcessing Methods\n\r')
    for filename in self.file_list:
        # filter() returns an iterator in Python 3, so materialize it before indexing
        matches = [x for x in changes if x.get('path') == filename]
        change = matches[0] if matches else None

        extension = get_extension(filename)
        fmt = self.formats.get(extension)
        method_cnf = fmt.get('method', {})
        method_cnf['arguments'] = fmt.get('arguments')
        builder = MethodBuilder(filename, method_cnf)
        builder.initialize(change=change)
        builder.prompts()
        builder.apply()
        builder.clear(filename)
def check_news(item):
    '''check_news(item):
    Validates the data loaded from load_news() and raises ValueError
    if something is wrong.
    '''
    if not item['title']:
        raise ValueError(u'Impossibile caricare una notizia senza titolo')
    if not item['content']:
        raise ValueError(u'Impossibile caricare una notizia senza testo')
    # datepick_to_datetime() itself will raise the ValueError in case
    item['date'] = datepick_to_datetime(item['date'])
    for p in item['photos']:
        if re.match('^image/[A-Za-z]*', p[1].mimetype) and p[0] == '':
            # A photo without a label
            raise ValueError(
                u"Impossibile caricare foto senza una descrizione."
            )
            # It is possible to UPDATE the label of an existing photo (so a pair label-nophoto
            # is allowed), while it is impossible to load a photo without a label, because the
            # label is always rendered in the page and thus loaded with the photo.
            # In case of UPLOAD I must check again and raise errors for every unpaired label.
        if not allowed_pic(p[1].filename):
            raise ValueError(
                u'''{0} non può essere caricato:<br>
                '*.{1}' non è tra le estensioni ammesse (ovvero {2}).<br>
                Ricarica le foto.
                '''.format(p[1].filename, get_extension(p[1].filename),
                           set_to_string(ALLOWED_EXTENSIONS_PICS))
            )
            # Secondary issue: the user now has to reload all the photos.
            # Could we show them what they loaded and let them correct it?
    return
def context_iter(self, context=None):
    """
    Iterate the tree depth-first, producing a context for each node.

    Args:
        context (dict): The parent context object

    Yields:
        dict: The context object for this node
    """
    if not context:
        context = {'parent_container_type': None}

    # Depth-first walk down tree
    context['container_type'] = self.type
    context[self.type] = self

    # Bring subject to top-level of context
    if self.type == 'session':
        context['subject'] = self.data['subject']

    # Additionally bring ext up if file
    if self.type == 'file':
        context['ext'] = utils.get_extension(self.data['name'])

    # Yield the current context before processing children
    yield context

    context['parent_container_type'] = self.type
    for child in self.children:
        context_copy = context.copy()
        for ctx in child.context_iter(context_copy):
            yield ctx
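# A minimal usage sketch for the generator above. The root-node variable and the
# idea of collecting file extensions are assumptions for illustration, not taken
# from the original project; only context_iter() and its context keys come from it.
def collect_file_extensions(root_node):
    """Walk the whole hierarchy and gather the extension of every file node."""
    extensions = []
    for ctx in root_node.context_iter():
        if ctx['container_type'] == 'file':
            extensions.append(ctx['ext'])
    return extensions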
def remove_unique_indicators(path):
    print path
    for filename in os.listdir(path):
        if ' (2)' in filename:
            index = filename.index(' (2)')
            extension = utils.get_extension(filename, with_dot=True)
            shorter_filename = filename[:index] + extension
            print filename
            os.rename(os.path.join(path, filename),
                      os.path.join(path, shorter_filename))
def find_train_test():
    where_train = Question("Where is your train dataset under project root?", 'data/raw/train.csv')
    where_test = Question("Where is your test dataset under project root?", 'data/raw/test.csv')
    train_path = where_train.ask()
    test_path = where_test.ask()
    is_csv = get_extension(train_path) == 'csv'
    if is_csv:
        what_sep = Question('Specify the separator used in the csv file:', ',')
        csv_sep = what_sep.ask()
        train, test = pd.read_csv(train_path, sep=csv_sep), pd.read_csv(test_path, sep=csv_sep)
    else:
        load_f = getattr(pd, 'read_' + get_extension(train_path))
        train = load_f(train_path)
        load_f = getattr(pd, 'read_' + get_extension(test_path))
        test = load_f(test_path)
    return train, test
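# The getattr(pd, 'read_' + extension) trick above dispatches to whichever pandas
# reader matches the file extension. A small, hedged illustration of the same idea;
# the helper name and example path are assumptions, not part of the original code.
import pandas as pd

def load_by_extension(path):
    ext = get_extension(path)             # e.g. 'csv', 'json', 'parquet'
    reader = getattr(pd, 'read_' + ext)   # pd.read_csv, pd.read_json, ...
    return reader(path)

# train = load_by_extension('data/raw/train.csv')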
def main():
    parser = argparse.ArgumentParser(description='Generate average atlas for an image folder.')
    parser.add_argument('--in_folder', type=str, help='The input image folder')
    parser.add_argument('--out', type=str, help='The output image path (with .nii.gz)')
    parser.add_argument('--ref', type=str,
                        help='Path of reference image. Define the affine and header of output nii.gz')
    parser.add_argument('--num_processes', type=int, default=20)
    args = parser.parse_args()

    file_list_all = os.listdir(args.in_folder)
    print('Process images under folder: ', args.in_folder)
    print('Number of files in folder %s is %d' % (args.in_folder, len(file_list_all)))
    nifti_file_list = [file_path for file_path in file_list_all if get_extension(file_path) == '.gz']
    print('Number of nii.gz files: ', len(nifti_file_list))

    file_name_chunks = get_chunks_list(nifti_file_list, args.num_processes)
    pool = Pool(processes=args.num_processes)
    result_list = [pool.apply_async(average_nii_file_list_mem, (file_name_chunk, args.in_folder))
                   for file_name_chunk in file_name_chunks]

    # Get the shape.
    # im_temp = nib.load(os.path.join(args.in_folder, nifti_file_list[0]))
    im_temp = nib.load(args.ref)
    im_header = im_temp.header
    im_affine = im_temp.affine
    im_temp_data = im_temp.get_data()
    im_shape = im_temp_data.shape
    averaged_image = np.zeros(im_shape)

    for thread_idx in range(len(result_list)):
        result = result_list[thread_idx]
        result.wait()
        print(f'Thread with idx {thread_idx} / {len(result_list)} is completed')
        print('Adding to averaged_image...')
        averaged_image_chunk = result.get()
        chunk_size = len(file_name_chunks[thread_idx])
        averaged_image = np.add(averaged_image, np.multiply(averaged_image_chunk, chunk_size))
        print('Done.')

    print('')
    print('Averaging over all images...')
    averaged_image = np.divide(averaged_image, len(nifti_file_list))
    print('Done.')

    print('Output to file: ', args.out)
    averaged_image_obj = nib.Nifti1Image(averaged_image, affine=im_affine, header=im_header)
    nib.save(averaged_image_obj, args.out)
def sort_files(path, filetype='gif'):
    """
    Sort files into a specific folder while *not* maintaining
    existing file structure patterns.
    """
    target_dir = "_{}s".format(filetype.lower())
    target = os.path.join(path, target_dir)
    for root, dirnames, filenames in os.walk(path):
        for filename in filenames:
            if ".ds_store" in filename.lower():
                continue
            extension = utils.get_extension(filename).strip('.')
            if extension == filetype.lower():
                utils.make_dir(target)
                new_name = utils.find_untaken_name(filename, target)
                os.rename(os.path.join(root, filename),
                          os.path.join(target, new_name))
def get_target_file_path(url, file_title, subreddit_target_dir, subfolder=None, new_only=False):
    file_extension = utils.get_extension(url)
    """
    if not utils.has_acceptable_extension(url):
        print "Not an accepted extension."
        continue
    """
    file_title = u"{}.{}".format(file_title, file_extension)
    clean_file_title = strip_rank_from_title(file_title)

    file_path = os.path.join(subreddit_target_dir, file_title)
    if subfolder:
        dir_path = os.path.join(subreddit_target_dir, subfolder)
        file_path = os.path.join(dir_path, file_title)
        make_dirs(dir_path)

    if os.path.isfile(file_path):
        print u"\"{}\" already exists.".format(file_title)
        return False

    print "Pulling {} ...".format(url),
    return file_path
def process(host, path, callback):
    path, workers = _build_workers(path)
    source_file = get_file(host + path)
    if get_extension(path) == 'svg':
        # http://redmine.pearbox.net/issues/1605
        source_image_type = 'svg+xml'
    else:
        image = get_image(source_file)
        exif = image.info.get('exif', b'')
        source_image_type = image.format.upper()
        for worker in workers:
            # extract palette from Image
            pl = image.getpalette()
            image = worker.do(image)
            if pl is not None:
                # if image has a palette then restore it
                image.putpalette(pl)
        source_file = StringIO()
        image.save(source_file, source_image_type, exif=exif)
    data = source_file.getvalue()
    data_len = len(data)
    logging.info(
        'Image was successfully processed with type {type} and len {data_len}'.format(
            type=source_image_type, data_len=data_len))
    callback('200 OK',
             [('Content-type', 'image/{type}'.format(type=source_image_type)),
              ('Content-length', str(data_len))])
    return [data]
async def create_meme_tempfile(imgpath, text, text_top=None):
    ext = utils.get_extension(imgpath)
    temp = tempfile.NamedTemporaryFile(suffix=f".{ext}", delete=False)
    await create_meme(imgpath, temp.name, text, text_top)
    return temp.name
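# A hedged usage sketch for the coroutine above: drive it from synchronous code
# with asyncio.run. The image path and caption strings are placeholders, not
# values from the original project.
import asyncio

if __name__ == "__main__":
    meme_path = asyncio.run(create_meme_tempfile("cat.jpg", "bottom text", text_top="top text"))
    print(meme_path)  # a NamedTemporaryFile path whose suffix matches the source image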
def parse_json(self, data):
    ipdata = json.loads(data)
    try:
        if ipdata['imgs']:
            for n in ipdata['imgs']:  # each item in the data list
                if n['objURL']:
                    try:
                        proxy_support = urllib2.ProxyHandler(proxy)
                        opener = urllib2.build_opener(proxy_support)
                        urllib2.install_opener(opener)
                        # print "proxy", proxy
                        self.lock()
                        self.dbcurr.execute(
                            'select ID from pic_info where objURL=%s',
                            (n['objURL']))
                        y = self.dbcurr.fetchone()
                        # print "y=", y
                        if y:
                            print "database exist"
                            self.unlock()  # unlock before continue
                            continue
                        else:
                            real_extension = utils.get_extension(n['objURL'])
                            req = urllib2.Request(n['objURL'], headers=i_headers)
                            resp = urllib2.urlopen(req, None, 5)
                            dataimg = resp.read()
                            name = str(uuid.uuid1())
                            filename = ""
                            if len(real_extension) > 4:
                                real_extension = ".gif"
                            real_extension = real_extension.lower()
                            if real_extension == ".gif":
                                filename = self.makeDateFolder(
                                    "E://sosogif", "d" + str(self.count % 60)
                                ) + "//" + name + "-www.sosogif.com-搜搜gif贡献" + real_extension
                                self.count += 1
                            else:
                                filename = self.makeDateFolder(
                                    "E://sosogif", "o" + str(self.count % 20)
                                ) + "//" + name + "-www.sosogif.com-搜搜gif贡献" + real_extension
                                self.count += 1
                            """
                            name=str(uuid.uuid1())
                            filename=""
                            if len(real_extension)>4:
                                real_extension=".gif"
                            filename =self.makeDateFolder("E://sosogif", "d"+str(self.count % 60))+"//"+name+"-www.sosogif.com-搜搜gif贡献"+real_extension
                            self.count+=1
                            """
                            try:
                                if not os.path.exists(filename):
                                    file_object = open(filename, 'w+b')
                                    file_object.write(dataimg)
                                    file_object.close()
                                    self.anaylis_info(n, filename, real_extension)  # insert into database
                                else:
                                    print "file exist"
                            except IOError, e1:
                                print "e1=", e1
                                pass
                        self.unlock()
                    except IOError, e2:
                        # print "e2=", e2
                        pass
                        self.chance1 += 1
def update_news(request, cursor, app, id):
    '''update_news(request, cursor, app, id):
    This function updates a news item, meaning that it can identify and overwrite a
    specific row in the database.
    Basically the same as the above for the text parts, but slightly different
    concerning the management of pictures.
    NB: it DOESN'T DEAL with the ValueErrors. The caller is supposed to manage them.
    '''
    item = load_news(request)
    check_news(item)
    old_pics = retrieve_item("news", id, cursor)['pics']
    try:
        for n in xrange(len(item['photos'])):
            if old_pics[n][1] != item['photos'][n][0]:
                # If labels are different, update them
                old_pics[n][1] = item['photos'][n][0]
            if item['photos'][n][1].filename != '':
                # If I have a new file:
                # Save it
                filename = 'File{0}.{1}'.format(
                    str(datetime.datetime.now()).translate(None, '.:- ')[:-3],
                    get_extension(secure_filename(item['photos'][n][1].filename)))
                item['photos'][n][1].save(os.path.join(app.config['UPLOAD_FOLDER_PICS'], filename))
                # Delete the old one
                try:
                    os.remove(os.path.join(BASE_PATH, app.config['UPLOAD_FOLDER_PICS'], old_pics[n][2]))
                except OSError as e:
                    # If the file isn't there, I simply leave the corrupted one (if it exists) orphan.
                    app.logger.error('OSError occurred in update_NEWS, probably orphan file. Error Code: {0}'.format(e))
                # Overwrite the name of the file in the database entry
                old_pics[n][2] = filename
    except IndexError:
        # Means that I'm trying to update one more photo than what I have in old_pics
        # (I'm adding a photo)
        filename = 'File{0}.{1}'.format(
            str(datetime.datetime.now()).translate(None, '.:- ')[:-3],
            get_extension(secure_filename(item['photos'][n][1].filename)))
        item['photos'][n][1].save(os.path.join(app.config['UPLOAD_FOLDER_PICS'], filename))
        old_pics.append((n, item['photos'][n][0], filename))
    item['pics'] = old_pics
    cursor.execute("UPDATE news SET data=?, title=?, text=?, pics=? WHERE id = ?",
                   [item['date'], item['title'], item['content'], json.dumps(item['pics']), id])
    return item
def upload_doc(request, cursor, app):
    '''upload_doc(request, cursor, app):
    This function performs a fresh upload of all the data previously loaded and checked.
    It adds a new row in the database without overwriting anything.
    In case of failure, it returns all the non-checked raw loaded data, to be displayed
    again to the user and let them correct it.
    NB: it DOESN'T DEAL with the ValueErrors. The caller is supposed to manage them.
    '''
    item = load_doc(request)
    check_doc(item)
    filename = 'File{0}.{1}'.format(
        str(datetime.datetime.now()).translate(None, '.:- ')[:-3],
        get_extension(secure_filename(item['file'].filename)))
    item['file'].save(os.path.join(app.config['UPLOAD_FOLDER_DOCS'], filename))
    cursor.execute("INSERT INTO docs (name, path) VALUES (?, ?)", [item['title'], filename])
    return
from datetime import datetime
from collections import defaultdict
from bokeh.plotting import figure
from bokeh.io import export_svgs
from multiprocessing import Pool

# these two imports are not shown in the original excerpt but are required below
import glob
import utils


def date_to_month(d):
    return datetime(d.year, d.month, 1)


base_dir = "/home/visgean/Dropbox/**/*"
picture_extensions = ['.jpg', '.jpeg', '.png']

pictures = list(filter(
    lambda f: utils.get_extension(f) in picture_extensions,
    glob.iglob(base_dir, recursive=True)
))

with Pool(12) as p:
    exif_data = p.map(utils.get_exif, pictures)
    dates = p.map(utils.parse_date, exif_data)

filesize_counter = defaultdict(int)
image_counter = defaultdict(int)

for filename, date in dates:
    if not date:
        continue
    month = date_to_month(date)
def update_doc(request, cursor, app, id):
    '''update_doc(request, cursor, app, id):
    This function updates a document, meaning that it can identify and overwrite a
    specific row in the database.
    NB #1: It doesn't call check_doc(), because the user may want to change only the
    label without replacing the original file, and vice versa.
    NB #2: it doesn't deal with the ValueErrors. The caller is supposed to manage them.
    '''
    item = load_doc(request)
    if item['file']:
        old_file = retrieve_item('doc', id, cursor)
        # First of all I upload the new file
        filename = 'File{0}.{1}'.format(
            str(datetime.datetime.now()).translate(None, '.:- ')[:-3],
            get_extension(secure_filename(item['file'].filename)))
        item['file'].save(os.path.join(app.config['UPLOAD_FOLDER_DOCS'], filename))
        cursor.execute("UPDATE docs SET path=? WHERE id = ?", [json.dumps(filename), id])
        # Then I remove the old one
        try:
            os.remove(os.path.join(BASE_PATH, app.config['UPLOAD_FOLDER_DOCS'], old_file['path']))
        except OSError as e:
            # If the file isn't there, I simply leave the corrupted one (if it exists) orphan.
            app.logger.error('OSError in update_DOC, probably orphan file. Error Code: {0}'.format(e))
    if item['title']:
        cursor.execute("UPDATE docs SET name=? WHERE id = ?", [item['title'], id])
    else:
        raise ValueError(u'''Impossibile caricare un documento senza titolo.<br>
        Se il titolo non è stato caricato automaticamente, contatta il webmaster.''')
    return item
def upload_news(request, cursor, app):
    '''upload_news(request, cursor, app):
    This function performs a fresh upload of all the material previously loaded and
    checked (except that unpaired labels, which are tolerated when updating, are not
    allowed here).
    It adds a new row in the database without overwriting anything.
    In case of failure, it returns all the non-checked raw loaded data, to be displayed
    again to the user and let them correct it.
    NB: it DOESN'T DEAL with the ValueErrors. The caller is supposed to manage them.
    '''
    item = load_news(request)
    check_news(item)
    # Validation
    for l in item['photos']:
        if (not re.match('^image/[A-Za-z]*', l[1].mimetype)) and l[0] != None:
            raise ValueError(
                u"Impossibile caricare una descrizione senza la relativa foto."
            )
    paths, labels = [], []
    for f in item['photos']:
        filename = 'File{0}.{1}'.format(
            str(datetime.datetime.now()).translate(None, '.:- ')[:-3],
            get_extension(secure_filename(f[1].filename)))
        f[1].save(os.path.join(app.config['UPLOAD_FOLDER_PICS'], filename))
        paths.append(filename)
        labels.append(f[0])
    pics = zip(xrange(len(paths)), labels, paths)
    cursor.execute("INSERT INTO news (data, title, text, pics) VALUES (?, ?, ?, ?)",
                   [item['date'], item['title'], item['content'], json.dumps(pics)])
    return
def main(): # Parse arguments from command line parser = argparse.ArgumentParser( description='Anonymize a dataset using Mondrian in Spark.') parser.add_argument('METADATA', help='json file that describes the job.') parser.add_argument('WORKERS', default=4, type=int, help='Number of initial cuts (workers)') parser.add_argument('DEMO', default=0, type=int, help='Start tool in demo mode') parser.add_argument('TEST', default=0, type=int, help='Start tool in test mode') args = parser.parse_args() demo = args.DEMO test = args.TEST start_time = time.time() with open(args.METADATA) as fp: job = json.load(fp) # Create Spark Session spark = SparkSession \ .builder \ .appName('mondrian') \ .getOrCreate() spark.sparkContext.setLogLevel("WARN") # Enable Arrow-based columnar data transfers spark.conf.set('spark.sql.execution.arrow.pyspark.enabled', 'true') if demo == 1: print("\n[*] Spark context initialized") print("\tWait for 10 seconds to continue demo...") time.sleep(10) # Parameters filename_in = job['input'] filename_out = job['output'] # when repartition is not given it defaults to repartitionByRange if 'repartition' in job and \ job['repartition'] in {'customRepartition', 'repartitionByRange', 'noRepartition'}: repartition = job['repartition'] else: repartition = 'repartitionByRange' id_columns = job.get('id_columns', []) redact = job.get('redact', False) quasiid_columns = job['quasiid_columns'] sensitive_columns = job.get('sensitive_columns') # when column score is not given it defaults to span score_functions = { 'span': span, 'entropy': entropy, 'neg_entropy': neg_entropy } if 'column_score' in job and job['column_score'] in score_functions: column_score = score_functions[job['column_score']] else: column_score = span fragments = min(args.WORKERS, job.get('max_fragments', 10**6)) K = job.get('K') L = job.get('L') measures = job.get('measures', []) # Setup mondrian_fragmentation function mondrian = functools.partial(mondrian_fragmentation, sensitive_columns=sensitive_columns, is_valid=get_validation_function(K, L)) # when fraction is not given it defaults to None if 'fraction' in job and 0 < job['fraction'] < 1: fraction = job['fraction'] else: fraction = None # when fragmentation is not given it defaults to quantile_fragmentation fragmentation_functions = { 'mondrian': mondrian, 'quantile': quantile_fragmentation } if 'fragmentation' in job and \ job['fragmentation'] in fragmentation_functions: fragmentation = fragmentation_functions[job['fragmentation']] else: fragmentation = quantile_fragmentation if not K and not L: raise Exception("Both K and L parameters not given or equal to zero.") if L and not sensitive_columns: raise Exception( "l-diversity needs to know which columns are sensitive.") if fraction and fragmentation == mondrian: sys.exit('''Sorry, currently mondrian fregmentation criteria is only available without sampling.''') if demo == 1: print("\n[*] Job details initialized") print("\tWait for 10 seconds to continue demo...") time.sleep(10) print('\n[*] Using {} initial partitions\n'.format(fragments)) # Read file according to extension print('[*] Reading from {}\n'.format(filename_in)) extension = get_extension(filename_in) df = spark.read \ .options(header='true', inferSchema='true') \ .format(extension).load(filename_in) if fraction: df = df.sample(fraction=fraction) pdf = df.toPandas() pdf.info() print('\n[*] Fragmentation details\n') """ TODO: Avoid having a single node performing this step for the whole dataset """ if not fraction: # Create first cut pdf = 
create_fragments(df=pdf, quasiid_columns=quasiid_columns, column_score=column_score, fragments=fragments, colname='fragment', criteria=fragmentation) # Check first cut sizes = pdf.groupby('fragment').size() print("\n[*] Dataset distribution among fragments\n") print(sizes) print("\n[*] Dataset with fragmentation info\n") print(pdf.head) # Compute the range on the quasi-identifiers columns # will be useful for information loss evaluation quasiid_range = [-1] * len(quasiid_columns) for i, column in enumerate(quasiid_columns): quasiid_range[i] = span(pdf[column]) # Recreate the dataframe in a way that is appreciated by pyarrow. pdf = pd.DataFrame.from_dict(pdf.to_dict()) # Create spark dataframe df = spark.createDataFrame(pdf) else: # Compute quantiles on the sample column, bins = get_fragments_quantiles(df=pdf, quasiid_columns=quasiid_columns, column_score=column_score, fragments=fragments) # Read entire file in distributed manner df = spark.read \ .options(header='true', inferSchema='true').csv(filename_in) bins[0] = float( "-inf") # to prevent out of Bucketizer bounds exception bins[-1] = float( "inf") # to prevent out of Bucketizer bounds exception if len(bins) != 2: # split into buckets only if there are more than 1 bucketizer = Bucketizer(splits=bins, inputCol=column, outputCol='fragment') df = bucketizer.transform(df) else: # otherwise assign every row to bucket 0 df = df.withColumn('fragment', F.lit(0.0)) # Check first cut sizes = df.groupBy('fragment').count() print("\n[*] Dataset distribution among fragments\n") sizes.show() print("\n[*] Dataset with fragmentation info\n") df.show() # Compute the range on the quasi-identifiers columns # will be useful for information loss evaluation categoricals = [ item[0] for item in df.dtypes if item[0] in quasiid_columns and item[1].startswith('string') ] funcs = (F.countDistinct(F.col(cname)) if cname in categoricals else F.max(F.col(cname)) - F.min(F.col(cname)) for cname in quasiid_columns) quasiid_range = df.agg(*funcs).collect()[0] # Create a schema in which identifiers are either not there or strings # and quasi identifiers are strings. # This is needed because the result of the UDF has to generalize them. if not redact: schema = T.StructType( df.select([ column for column in df.columns if column not in id_columns ]).schema) else: schema = T.StructType(df.schema) for column in id_columns: schema[column].dataType = T.StringType() for column in quasiid_columns: schema[column].dataType = T.StringType() # TODO: add a column to the output schema to keep information on the # equivalent classes to avoid reconstructing them from scratch # in the evaluation of the metrics if demo == 1 and fragments > 1: print("\n[*] Dataset fragmented") print("\tWait for 10 seconds to continue demo...") time.sleep(10) # initialize taxonomies quasiid_gnrlz = __generalization_preproc(job, df, spark=spark) if demo == 1 and quasiid_gnrlz: print("\n[*] Taxonomies data preprocessed") print("\tWait for 10 seconds to continue demo...") time.sleep(10) # Create the pandas udf @F.pandas_udf(schema, F.PandasUDFType.GROUPED_MAP) def anonymize_udf(pdf): adf = anonymize(df=pdf, id_columns=id_columns, redact=redact, quasiid_columns=quasiid_columns, sensitive_columns=sensitive_columns, column_score=column_score, K=K, L=L, quasiid_gnrlz=quasiid_gnrlz) # Ensure that the quasi identifier columns have been converted # to strings (they are required by the return type). 
for column in quasiid_columns: adf[column] = adf[column].astype('object') return adf if repartition == 'repartitionByRange': df = df.repartitionByRange('fragment') elif repartition == 'customRepartition': df = repartition_dataframe(df, spark) print('\n[*] Starting anonymizing the dataframe\n') print('Number of DF partitions: {}'.format(df.rdd.getNumPartitions())) ''' Debug spark partitioning -> Low performance count = 0 for elem in df.rdd.glom().collect(): print("Size of Spark Partition {}: {}".format(count, len(elem))) count +=1 ''' adf = df \ .groupby('fragment') \ .applyInPandas(anonymize_udf.func, schema=anonymize_udf.returnType) \ .cache() # Create Discernability Penalty udf schema = T.StructType( [T.StructField('information_loss', T.LongType(), nullable=False)]) @F.pandas_udf(schema, F.PandasUDFType.GROUPED_MAP) def discernability_penalty_udf(adf): dp = discernability_penalty(adf=adf, quasiid_columns=quasiid_columns) # pandas_udf requires a pandas dataframe as output return pd.DataFrame({'information_loss': [dp]}) # Create Normalized Certainty Penalty udf schema = T.StructType( [T.StructField('information_loss', T.DoubleType(), nullable=False)]) @F.pandas_udf(schema, F.PandasUDFType.GROUPED_MAP) def normalized_certainty_penalty_udf(adf): gcp = normalized_certainty_penalty(adf=adf, quasiid_columns=quasiid_columns, quasiid_range=quasiid_range, quasiid_gnrlz=quasiid_gnrlz) # pandas_udf requires a pandas dataframe as output return pd.DataFrame({'information_loss': [gcp]}) if repartition == 'repartitionByRange': adf = adf.repartitionByRange('fragment') elif repartition == 'customRepartition': adf = repartition_dataframe(adf, spark) print('Number of ADF partitions: {}'.format(adf.rdd.getNumPartitions())) adf.drop('fragment').show(10) print('\n[*] Anonymized dataframe') if demo == 1: print("\tWait for 10 seconds to continue demo...\n") time.sleep(10) # dictionary to store test params measures_log = {} measures_log["fragments"] = fragments measures_log["repartition"] = repartition measures_log["K"] = K measures_log["L"] = L measures_log["fraction"] = fraction if measures: print('[*] Information loss evaluation\n') for measure in measures: if measure == 'discernability_penalty': dp = evaluate_information_loss(adf, discernability_penalty_udf) print(f"Discernability Penalty = {dp:.2E}") measures_log["DP"] = dp elif measure == 'normalized_certainty_penalty': ncp = evaluate_information_loss(adf, normalized_certainty_penalty_udf) print(f"Normalized Certainty Penalty = {ncp:.2E}") measures_log["NCP"] = ncp elif measure == 'global_certainty_penalty': gcp = evaluate_information_loss(adf, normalized_certainty_penalty_udf) gcp /= (len(quasiid_columns) * adf.count()) print(f"Global Certainty Penalty = {gcp:.4f}") measures_log["GCP"] = gcp # Remove fragmentation information adf = adf.drop('fragment') # Write file according to extension print(f"\n[*] Writing to {filename_out}\n") extension = get_extension(filename_out) adf.write \ .mode("overwrite") \ .options(header=True) \ .format(extension) \ .save(filename_out) end_time = time.time() execution_time = end_time - start_time measures_log["timestamp"] = end_time measures_log["time"] = execution_time if test == 1: # Write test params to Hadoop test_result_files = [ "hdfs://namenode:8020/anonymized/test_results.csv", "hdfs://namenode:8020/anonymized/artifact_result.csv" ] print("[*] Creating test configuration file on Hadoop") write_test_params(spark, measures_log, test_result_files) if demo == 0: print("--- %s seconds ---" % (execution_time)) spark.stop() 
print('\n[*] Done\n')
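# A hedged sketch of the job-description JSON consumed by main() above, inferred
# only from the keys the code reads (job['input'], job['quasiid_columns'], 'K',
# 'L', 'measures', ...). Column names, paths, and values are placeholders for
# illustration, not taken from the original project.
example_job = {
    "input": "hdfs://namenode:8020/dataset/adults.csv",        # placeholder path
    "output": "hdfs://namenode:8020/anonymized/adults.csv",    # placeholder path
    "quasiid_columns": ["age", "education", "zipcode"],        # placeholder columns
    "sensitive_columns": ["income"],
    "id_columns": ["ssn"],
    "K": 3,
    "L": 2,
    "column_score": "span",              # or "entropy" / "neg_entropy"
    "fragmentation": "quantile",         # or "mondrian"
    "repartition": "repartitionByRange", # or "customRepartition" / "noRepartition"
    "measures": ["discernability_penalty", "normalized_certainty_penalty"]
}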
def startSpiderWap(self):
    if self.spider_queue.empty():
        fetched_users = self.db.execute(
            'SELECT * from spider_list ORDER BY weight DESC limit 0,30')
        if fetched_users <= 0:
            print 'nothing to spider,spider_list is empty'
            return False
        self.start = 'start'
        self.errno = ERR_NO
        fetchall = self.db.fetchall()
        # add the pending sharers fetched from the database to the crawl queue
        for item in fetchall:
            self.spider_queue.put({
                'sid': item[0],
                'uk': item[1],
                'file_fetched': item[2],
                'follow_fetched': item[3],
                'follow_done': item[4],
                'file_done': item[5],
                'weight': item[6],
                'uid': item[7]
            })
        self.got_follow_count = 0
        self.got_files_count = 0
        self.while_count = 0
    while not self.spider_queue.empty():
        self.while_count += 1
        share_user = self.spider_queue.get()
        # crawl the sharer's file list
        if not share_user['file_done']:
            print '%d now spidering file ,%d file fetched' % (
                share_user['uk'], share_user['file_fetched'])
            rs = self.getShareListsWap(share_user['uk'], share_user['file_fetched'])
            if not rs:
                print 'uk:%d error to fetch files,try again later...' % share_user['uk']
                return True
            total_count, fetched_count, file_list = rs
            total_fetched = share_user['file_fetched'] + fetched_count
            print 'fetched_file_count:%d' % fetched_count
            if total_fetched >= total_count or total_count == 0:
                share_user['file_done'] = 1  # all files of this sharer have been crawled
            if total_count == 0:
                self.db.execute(
                    "UPDATE spider_list set file_done=%s WHERE sid=%s",
                    (1, share_user['sid']))
                self.db.commit()
            else:
                try:
                    files_count = 0
                    for file in file_list:
                        files_count += 1
                        ext = ''
                        file_type = ''
                        file_type_i = -1
                        if file['isdir'] == 0 and file['feed_type'] == 'share':
                            ext = utils.get_extension(file['title']).lower()
                            file_type = utils.get_category(ext)
                            file_type_i = self.file_type_t[file_type]
                        time_stamp = int(time.time())
                        self.db.execute(
                            "INSERT INTO share_file (title,uk,shareid,shorturl,isdir,size,md5,ext,feed_time,create_time,file_type,uid,feed_type) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                            (file['title'], file['uk'], file['shareid'],
                             file['shorturl'], file['isdir'], file['size'],
                             file['md5'], ext, file['feed_time'], time_stamp,
                             file_type_i, share_user['uid'], file['feed_type']))
                except:
                    share_user['file_done'] = 0
                    self.db.rollback()
                    traceback.print_exc()
                    return False
                else:
                    self.db.execute(
                        "UPDATE spider_list set file_fetched=%s,file_done=%s WHERE sid=%s",
                        (total_fetched, share_user['file_done'], share_user['sid']))
                    self.db.execute(
                        "UPDATE share_users set fetched=%s WHERE uid=%s",
                        (total_fetched, share_user['uid']))
                    share_user['file_fetched'] = total_fetched
                    self.got_files_count += files_count
                    self.db.commit()
        # after the files, the follow list would be crawled; skipped for wap for now
        if share_user['follow_done'] == 0 and share_user['file_done'] == 1:
            share_user['follow_done'] = 1
            print '删除用户:%d' % share_user['sid']
            self.db.execute("DELETE FROM spider_list WHERE sid=%s",
                            (share_user['sid'], ))
            self.db.commit()
        time.sleep(SPIDER_INTERVAL)
    print '-----------------Done------------------'
    print 'while_count:%d' % self.while_count
    print 'got_follow_count:%d' % self.got_follow_count
    print 'got_files_count:%d' % self.got_files_count
    return True
print("Directory " + folderName + " already exists") if not os.path.exists(fileFolderName): os.mkdir(fileFolderName) print("Directory " + fileFolderName + " Created ") else: print("Directory " + fileFolderName + " already exists") i = 0 for doc_id in doc_id_list: url_final = url_descarga_1 + doc_id + url_descarga_2 file_path = os.path.join( fileFolderName, date_name + "&" + number_list[i] + "&" + entry_number_list[i]) print("\n" + file_path) file_extension = utils.get_extension(url_final) file_complete_path = file_path + "." + file_extension if os.path.exists(file_complete_path): os.remove(file_complete_path) file_name = wget.download(url_final, file_complete_path) if (file_extension != "pdf"): utils.convert_to_pdf(file_name) i = i + 1 input("\nPress any key to close")
def main():
    parser = argparse.ArgumentParser(description='Generate average atlas for an image folder.')
    parser.add_argument('--in_folder', type=str, help='The input image folder')
    parser.add_argument('--out_union', type=str, help='The output image path (with .nii.gz)')
    parser.add_argument('--out_inter', type=str, help='The output image path (with .nii.gz)', default='')
    parser.add_argument('--ref', type=str,
                        help='Path of reference image. Define the affine and header of output nii.gz')
    parser.add_argument('--num_processes', type=int, default=10)
    args = parser.parse_args()

    file_list_all = os.listdir(args.in_folder)
    print('Process images under folder: ', args.in_folder)
    print('Number of files in folder %s is %d' % (args.in_folder, len(file_list_all)))
    nifti_file_list = [file_path for file_path in file_list_all if get_extension(file_path) == '.gz']
    print('Number of nii.gz files: ', len(nifti_file_list))

    file_name_chunks = get_chunks_list(nifti_file_list, args.num_processes)
    pool = Pool(processes=args.num_processes)

    # Get the shape.
    # im_temp = nib.load(os.path.join(args.in_folder, nifti_file_list[0]))
    im_temp = nib.load(args.ref)
    im_header = im_temp.header
    im_affine = im_temp.affine
    im_temp_data = im_temp.get_data()
    im_shape = im_temp_data.shape

    averaged_image_union = np.zeros(im_shape)
    averaged_image_inter = np.zeros(im_shape)
    averaged_image_union.fill(np.nan)
    # averaged_image_inter.fill(np.nan)
    non_null_mask_count_image = np.zeros(im_shape)

    if args.out_inter != '':
        print('Average in intersection:')
        image_average_inter_result_list = [pool.apply_async(sum_images_inter, (file_name_chunk, args.in_folder))
                                           for file_name_chunk in file_name_chunks]
        for thread_idx in range(len(image_average_inter_result_list)):
            result = image_average_inter_result_list[thread_idx]
            result.wait()
            print(f'Thread with idx {thread_idx} / {len(image_average_inter_result_list)} is completed')
            print('Adding to averaged_image...')
            averaged_image_chunk = result.get()
            averaged_image_inter = add_image_inter(averaged_image_inter, averaged_image_chunk)
            print('Done.')
        averaged_image_inter = np.divide(averaged_image_inter, len(nifti_file_list),
                                         out=averaged_image_inter,
                                         where=np.logical_not(np.isnan(averaged_image_inter)))
        average_image_inter_obj = nib.Nifti1Image(averaged_image_inter, affine=im_affine, header=im_header)
        print(f'Saving to {args.out_inter}')
        nib.save(average_image_inter_obj, args.out_inter)
        print('Done.')
        print('')

    print('Average in union')
    image_average_union_result_list = [pool.apply_async(sum_images_union, (file_name_chunk, args.in_folder))
                                       for file_name_chunk in file_name_chunks]
    for thread_idx in range(len(image_average_union_result_list)):
        result = image_average_union_result_list[thread_idx]
        result.wait()
        print(f'Thread with idx {thread_idx} / {len(image_average_union_result_list)} is completed')
        print('Adding to averaged_image...')
        averaged_image_chunk = result.get()
        averaged_image_union = add_image_union(averaged_image_union, averaged_image_chunk)
        print('Done.')

    non_null_mask_count_result = [pool.apply_async(sum_non_null_count, (file_name_chunk, args.in_folder))
                                  for file_name_chunk in file_name_chunks]
    for thread_idx in range(len(non_null_mask_count_result)):
        result = non_null_mask_count_result[thread_idx]
        result.wait()
        print(f'Thread with idx {thread_idx} / {len(non_null_mask_count_result)} is completed')
        print('Adding to averaged_image...')
        averaged_image_chunk = result.get()
        non_null_mask_count_image = np.add(non_null_mask_count_image, averaged_image_chunk)
        print('Done.')

    averaged_image_union = np.divide(averaged_image_union, non_null_mask_count_image,
                                     out=averaged_image_union,
                                     where=non_null_mask_count_image > 0)
    averaged_image_union_obj = nib.Nifti1Image(averaged_image_union, affine=im_affine, header=im_header)
    nib.save(averaged_image_union_obj, args.out_union)
    print('Done.')
def fetch_spacex_last_launch():
    images = get_spacex_last_launch_images()
    for image_number, image_url in enumerate(images):
        download_file(
            image_url,
            'spacex{}.{}'.format(image_number, get_extension(image_url)))
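# A minimal sketch of the download_file helper assumed above. Its name comes from
# the call site only; the real implementation in the original project may differ.
# This version uses requests, which that project may or may not depend on.
import requests

def download_file(url, filename):
    response = requests.get(url)
    response.raise_for_status()          # fail loudly on HTTP errors
    with open(filename, 'wb') as f:
        f.write(response.content)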
def startSpider(self):
    if self.spider_queue.empty():
        fetched_users = self.db.execute('SELECT * from spider_list ORDER BY weight DESC limit 0,20')
        if fetched_users <= 0:
            print('nothing to spider,spider_list is empty')
            return False
        self.start = 'start'
        self.errno = ERR_NO
        fetchall = self.db.fetchall()
        # add the pending sharers fetched from the database to the crawl queue
        for item in fetchall:
            self.spider_queue.put({
                'sid': item[0],
                'uk': item[1],
                'file_fetched': item[2],
                'follow_fetched': item[3],
                'follow_done': item[4],
                'file_done': item[5],
                'weight': item[6],
                'uid': item[7]
            })
        self.got_follow_count = 0
        self.got_files_count = 0
        self.while_count = 0
    while not self.spider_queue.empty():
        self.while_count += 1
        share_user = self.spider_queue.get()
        # crawl the sharer's file list
        if not share_user['file_done']:
            print('%d now spidering file ,%d file fetched' % (share_user['uk'], share_user['file_fetched']))
            rs = self.getShareLists(share_user['uk'], share_user['file_fetched'])
            if not rs:
                print('uk:%d error to fetch files,try again later...' % share_user['uk'])
                return True
            total_count, fetched_count, file_list = rs
            total_fetched = share_user['file_fetched'] + fetched_count
            print('fetched_file_count:%d' % fetched_count)
            if total_fetched >= total_count or total_count == 0:
                share_user['file_done'] = 1  # all files of this sharer have been crawled
            if total_count == 0:
                self.db.execute("UPDATE spider_list set file_done=%s WHERE sid=%s", (1, share_user['sid']))
                self.db.commit()
            else:
                try:
                    files_count = 0
                    for file in file_list:
                        files_count += 1
                        ext = ''
                        file_type = ''
                        file_type_i = -1
                        if file['isdir'] == 0 and file['feed_type'] == 'share':
                            ext = utils.get_extension(file['title']).lower()
                            file_type = utils.get_category(ext)
                            file_type_i = self.file_type_t[file_type]
                        time_stamp = int(time.time())
                        self.db.execute(
                            "INSERT INTO share_file (title,uk,shareid,shorturl,isdir,size,md5,ext,feed_time,create_time,file_type,uid,feed_type) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                            (file['title'], file['uk'], file['shareid'], file['shorturl'],
                             file['isdir'], file['size'], file['md5'], ext, file['feed_time'],
                             time_stamp, file_type_i, share_user['uid'], file['feed_type'])
                        )
                except:
                    share_user['file_done'] = 0
                    self.db.rollback()
                    traceback.print_exc()
                    return False
                else:
                    self.db.execute("UPDATE spider_list set file_fetched=%s,file_done=%s WHERE sid=%s",
                                    (total_fetched, share_user['file_done'], share_user['sid']))
                    self.db.execute("UPDATE share_users set fetched=%s WHERE uid=%s",
                                    (total_fetched, share_user['uid']))
                    share_user['file_fetched'] = total_fetched
                    self.got_files_count += files_count
                    self.db.commit()
        # after crawling the files, crawl the follow list
        if share_user['follow_done'] == 0 and share_user['file_done'] == 1:
            print('%d now spidering follow ,%d follow fetched' % (share_user['uk'], share_user['follow_fetched']))
            rs = self.getFollows(share_user['uk'], share_user['follow_fetched'])
            if not rs:
                print('error to fetch follows,try again later...')
                return
            total_count, fetched_count, follow_list = rs
            total_fetched = share_user['follow_fetched'] + fetched_count
            print('fetched_follow_count:%d' % fetched_count)
            if total_fetched >= total_count or total_count == 0:
                share_user['follow_done'] = 1
            if total_count == 0:
                self.db.execute("DELETE FROM spider_list WHERE sid=%s", (share_user['sid'],))
                self.db.commit()
            else:
                try:
                    follow_count = 0
                    for follow in follow_list:
                        follow_count += 1
                        # check whether this user is already in the table
                        if self.db.execute('SELECT * FROM share_users WHERE uk=%s', (follow['follow_uk'],)) > 0:
                            print('uk:%d is already in share_user table' % follow['follow_uk'])
                            continue
                        time_stamp = int(time.time())
                        self.db.execute(
                            "INSERT INTO share_users (uk,user_name,avatar_url,intro,follow_count,album_count,"
                            "fens_count,pubshare_count,last_visited,create_time,weight) "
                            "VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                            (follow['follow_uk'], follow['follow_uname'], follow['avatar_url'],
                             follow['intro'], follow['follow_count'], follow['album_count'],
                             follow['fans_count'], follow['pubshare_count'], time_stamp,
                             time_stamp, 5)
                        )
                        # add the newly found sharer to the crawl list
                        self.db.execute("INSERT INTO spider_list (uk,uid) VALUES(%s,%s)",
                                        (follow['follow_uk'], self.db.last_row_id()))
                except:
                    share_user['follow_done'] = 0
                    self.db.rollback()
                    traceback.print_exc()
                    return False
                else:
                    if share_user['follow_done'] == 1:
                        # follows crawled; this sharer is finished, remove it from the pending list
                        print('delete follow fetched sid:%d from spider_list' % share_user['sid'])
                        self.db.execute("DELETE FROM spider_list WHERE sid=%s", (share_user['sid'],))
                    else:
                        self.db.execute("UPDATE spider_list set follow_fetched=%s,follow_done=%s WHERE sid=%s",
                                        (total_fetched, share_user['follow_done'], share_user['sid']))
                    share_user['follow_fetched'] = total_fetched
                    self.got_follow_count += follow_count
                    self.db.commit()
        # if the sharer is not fully crawled yet, put it back on the work queue
        if share_user['follow_done'] == 0:
            self.spider_queue.put(share_user)
        else:
            print('%d has done' % share_user['uk'])
            del share_user
        time.sleep(SPIDER_INTERVAL)
    print('-----------------Done------------------')
    print('while_count:%d' % self.while_count)
    print('got_follow_count:%d' % self.got_follow_count)
    print('got_files_count:%d' % self.got_files_count)
    return True
np.save(without_extension(_file) + '.npy', encoding)
print("(batch_size, time_steps, dimensions) :", encoding.shape)

# plotting #
if PLOT:
    fig, axs = plt.subplots(2, 1, figsize=(10, 5))
    axs[0].plot(audio)
    axs[0].set_title('Audio Signal')
    axs[1].plot(encoding[0])
    axs[1].set_title('NSynth Encoding')

# decoding #
'''Synthesizes audio from the encoding and saves it'''
fastgen.synthesize(
    encoding,
    save_paths=[without_extension(_file) + "_decoded." + get_extension(_file)],
    samples_per_save=sample_length)

if DEBUG:
    print("Generation for normal encoding achieved !")

# slower and faster encoding #
encoding_slower = timestretch(encoding, 1.5)
encoding_faster = timestretch(encoding, 0.5)

if PLOT:
    fig, axs = plt.subplots(3, 1, figsize=(10, 7), sharex=True, sharey=True)
    axs[0].plot(encoding[0])
    axs[0].set_title('Encoding (Normal Speed)')
    axs[1].plot(encoding_faster[0])
    axs[1].set_title('Encoding (Faster)')