Example #1
    def load_train_and_test(self, train, test):
        if isinstance(train, pd.DataFrame) and isinstance(test, pd.DataFrame):
            return train, test
        trainextension, testextension = get_extension(train.basename()), get_extension(test.basename())
        trainloadf, testloadf = getattr(pd, 'read_' + trainextension), getattr(pd, 'read_' + testextension)
        train: pd.DataFrame = trainloadf(train)
        test: pd.DataFrame = testloadf(test)
        return train, test
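Note: every example on this page leans on a small get_extension helper, and the projects disagree on its exact contract: some compare the result against '.gz' (leading dot kept), others against 'csv' (dot stripped) or append it after an explicit '.', and one passes a with_dot=True flag. Purely as an assumed point of reference, a minimal sketch wrapping os.path.splitext could look like this:

import os

def get_extension(path, with_dot=False):
    # Hypothetical helper, not taken from any of the projects below:
    # return the extension of `path`, stripping the leading dot unless
    # with_dot=True. The real helpers used in these examples may differ.
    _, ext = os.path.splitext(path)
    return ext if with_dot else ext.lstrip('.')
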
Example #2
    def process_methods(self, diff_only=False, changes=[]):
        """
        Entry point for documenting the methods in any file
        that needs to be documented.
        Parameters
        ----------
        bool diff_only: Use a diff only. Consumed by dyc diff.
        list changes: Changes in a file, mainly used with dyc diff.
        """
        print('\nProcessing Methods\n\r')
        for filename in self.file_list:
            # filter() returns an iterator in Python 3, so take the first
            # matching change (or None) instead of indexing into it
            change = next(
                (x for x in changes if x.get('path') == filename), None)

            extension = get_extension(filename)
            fmt = self.formats.get(extension)
            method_cnf = fmt.get('method', {})
            method_cnf['arguments'] = fmt.get('arguments')
            builder = MethodBuilder(filename, method_cnf)
            builder.initialize(change=change)
            builder.prompts()
            builder.apply()
            builder.clear(filename)
Example #3
def check_news(item):
    '''check_news(item):
    Performs validation on the data loaded from load_news() and raises
    ValueError if something is wrong.
    '''
    if not item['title']:
        raise ValueError(u'Impossibile caricare una notizia senza titolo') 
    if not item['content']:
        raise ValueError(u'Impossibile caricare una notizia senza testo')
        
    # datepick_to_datetime() itself will raise the ValueError in case of a bad date
    item['date'] = datepick_to_datetime(item['date']) 

    for p in item['photos']:
        if re.match('^image/[A-Za-z]*', p[1].mimetype) and p[0] == '':      # If I have a photo without the label
            raise ValueError( u"Impossibile caricare foto senza una descrizione." )
        # It is possible to UPDATE the label of an existing photo (so a label without a photo is allowed),
        # while it is impossible to load a photo without a label, because the label is always rendered in the page
        # and thus loaded with the photo. In case of UPLOAD I must check again and raise errors for every unpaired label.
    
        if not allowed_pic(p[1].filename):
            raise ValueError( u'''{0} non può essere caricato:<br>
                                  '*.{1}' non è tra le estensioni ammesse (ovvero {2}).<br>
                                  Ricarica le foto.
                               '''.format(p[1].filename, get_extension(p[1].filename), set_to_string(ALLOWED_EXTENSIONS_PICS)) )
                               # Secondary issue: now the user has to reload all the photos.
                               # Would it be possible to show them again what they loaded and let them correct it?
    return
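For reference, check_news() expects an item dict shaped like the one load_news() presumably builds. A hypothetical payload that would pass the checks above, assuming the photos are werkzeug FileStorage objects (as the later upload/update examples suggest) and that datepick_to_datetime() accepts the date string shown:

from werkzeug.datastructures import FileStorage

item = {
    'title': u'Titolo della notizia',
    'content': u'Testo della notizia',
    'date': '31/01/2015',  # placeholder; whatever format datepick_to_datetime() expects
    'photos': [
        (u'Didascalia', FileStorage(filename='foto.jpg', content_type='image/jpeg')),
    ],
}
check_news(item)
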
Example #4
 def parse_json(self,data):
     
     ipdata = json.loads(data)
     try:
         if ipdata['imgs']:  
             for n in ipdata['imgs']: # each image entry in the data
                 if n['objURL']:  
                     try:
                         proxy_support = urllib2.ProxyHandler(proxy)
                         opener = urllib2.build_opener(proxy_support)
                         urllib2.install_opener(opener)
                         #print "proxy",proxy
                         self.lock()
                         self.dbcurr.execute('select ID from pic_info where objURL=%s', (n['objURL'],))
                         y = self.dbcurr.fetchone()
                         #print "y=",y
                         if y:
                             print "database exist"
                             self.unlock() # unlock before continue
                             continue
                         else:
                             real_extension=utils.get_extension(n['objURL'])
                             req = urllib2.Request(n['objURL'],headers=i_headers)
                             resp = urllib2.urlopen(req,None,5)
                             dataimg=resp.read()
                             name=str(uuid.uuid1())
                             filename=""
                             if len(real_extension)>4:
                                 real_extension=".gif"
                             real_extension=real_extension.lower()
                             if real_extension==".gif":
                                 filename  =self.makeDateFolder("E://sosogif", "d"+str(self.count % 60))+"//"+name+"-www.sosogif.com-搜搜gif贡献"+real_extension
                                 self.count+=1
                             else:
                                 filename  =self.makeDateFolder("E://sosogif", "o"+str(self.count % 20))+"//"+name+"-www.sosogif.com-搜搜gif贡献"+real_extension
                                 self.count+=1
                             """
                             name=str(uuid.uuid1())
                             filename=""
                             if len(real_extension)>4:
                                 real_extension=".gif"
                             filename  =self.makeDateFolder("E://sosogif", "d"+str(self.count % 60))+"//"+name+"-www.sosogif.com-搜搜gif贡献"+real_extension
                             self.count+=1 
                             """
                             try: 
                                 if not os.path.exists(filename): 
                                     file_object = open(filename,'w+b')  
                                     file_object.write(dataimg)  
                                     file_object.close()
                                     self.anaylis_info(n,filename,real_extension) # save record to the database
                                 else:
                                     print "file exist" 
                             except IOError,e1:  
                                 print "e1=",e1
                                 pass
                         self.unlock()
                     except IOError,e2:  
                         #print "e2=",e2 
                         pass  
                         self.chance1+=1
    def context_iter(self, context=None):
        """
        Iterate the tree depth-first, producing a context for each node.

        Args:
            context (dict): The parent context object

        Yields:
            dict: The context object for this node
        """
        if not context:
            context = {'parent_container_type': None}

        # Depth-first walk down tree
        context['container_type'] = self.type
        context[self.type] = self
        
        # Bring subject to top-level of context
        if self.type == 'session':
            context['subject'] = self.data['subject']

        # Additionally bring ext up if file
        if self.type == 'file':
            context['ext'] = utils.get_extension(self.data['name'])

        # Yield the current context before processing children
        yield context

        context['parent_container_type'] = self.type
        for child in self.children:
            context_copy = context.copy()
            for ctx in child.context_iter(context_copy):
                yield ctx
def remove_unique_indicators(path):
    print path
    for filename in os.listdir(path):
        if ' (2)' in filename:
            index = filename.index(' (2)')
            extension = utils.get_extension(filename, with_dot=True)
            shorter_filename = filename[:index] + extension
            print filename
            os.rename(os.path.join(path, filename), os.path.join(path, shorter_filename))
def find_train_test():
    where_train = Question("Where is your train dataset under project root?",
                           'data/raw/train.csv')
    where_test = Question("Where is your test dataset under project root?",
                          'data/raw/test.csv')
    train_path = where_train.ask()
    test_path = where_test.ask()
    is_csv = get_extension(train_path) == 'csv'
    if is_csv:
        what_sep = Question('Specify separator into csv file:', ',')
        csv_sep = what_sep.ask()
        train, test = pd.read_csv(train_path,
                                  csv_sep), pd.read_csv(test_path, csv_sep)
    else:
        load_f = getattr(pd, 'read_' + get_extension(train_path))
        train = load_f(train_path)
        load_f = getattr(pd, 'read_' + get_extension(test_path))
        test = load_f(test_path)
    return train, test
def main():
    parser = argparse.ArgumentParser(description='Generate average atlas for an image folder.')
    parser.add_argument('--in_folder', type=str,
                        help='The input image folder')
    parser.add_argument('--out', type=str,
                        help='The output image path (with .nii.gz)')
    parser.add_argument('--ref', type=str,
                        help='Path of reference image. Define the affine and header of output nii.gz')
    parser.add_argument('--num_processes', type=int, default=20)

    args = parser.parse_args()
    file_list_all = os.listdir(args.in_folder)
    print('Process images under folder: ', args.in_folder)
    print('Number of files in folder %s is %d' % (args.in_folder, len(file_list_all)))
    nifti_file_list = [file_path for file_path in file_list_all if get_extension(file_path) == '.gz']
    print('Number of nii.gz files: ', len(nifti_file_list))

    file_name_chunks = get_chunks_list(nifti_file_list, args.num_processes)

    pool = Pool(processes=args.num_processes)

    result_list = [pool.apply_async(average_nii_file_list_mem, (file_name_chunk, args.in_folder)) for file_name_chunk in file_name_chunks]

    # Get the shape.
    # im_temp = nib.load(os.path.join(args.in_folder, nifti_file_list[0]))
    im_temp = nib.load(args.ref)
    im_header = im_temp.header
    im_affine = im_temp.affine
    im_temp_data = im_temp.get_data()
    im_shape = im_temp_data.shape
    averaged_image = np.zeros(im_shape)
    for thread_idx in range(len(result_list)):
        result = result_list[thread_idx]
        result.wait()
        print(f'Thread with idx {thread_idx} / {len(result_list)} is completed')
        print('Adding to averaged_image...')
        averaged_image_chunk = result.get()
        chunk_size = len(file_name_chunks[thread_idx])
        averaged_image = np.add(averaged_image, np.multiply(averaged_image_chunk, chunk_size))
        print('Done.')

    print('')
    print('Averaging over all images...')
    averaged_image = np.divide(averaged_image, len(nifti_file_list))
    print('Done.')

    print('Output to file: ', args.out)
    averaged_image_obj = nib.Nifti1Image(averaged_image, affine=im_affine, header=im_header)
    nib.save(averaged_image_obj, args.out)
def sort_files(path, filetype='gif'):
    """
    Sort files into a specific folder while *not* maintaining existing 
    file structure patterns.
    """
    target_dir = "_{}s".format(filetype.lower())
    target = os.path.join(path, target_dir)
    for root, dirnames, filenames in os.walk(path):
        for filename in filenames:
            if ".ds_store" in filename.lower():
                continue
            extension = utils.get_extension(filename).strip('.')
            if extension == filetype.lower():
                utils.make_dir(target)
                new_name = utils.find_untaken_name(filename, target)
                os.rename(os.path.join(root, filename), os.path.join(target, new_name))
def get_target_file_path(url, file_title, subreddit_target_dir, subfolder=None, new_only=False):
    file_extension = utils.get_extension(url)
    """
    if not utils.has_acceptable_extension(url):
        print "Not an accepted extension."
        continue
    """
    file_title = u"{}.{}".format(file_title, file_extension)
    clean_file_title = strip_rank_from_title(file_title)
    file_path = os.path.join(subreddit_target_dir, file_title)
    if subfolder:
        dir_path = os.path.join(subreddit_target_dir, subfolder)
        file_path = os.path.join(dir_path, file_title)
        make_dirs(dir_path)
    if os.path.isfile(file_path):
        print u"\"{}\" already exists.".format(file_title)
        return False
    print "Pulling {} ...".format(url),
    return file_path
Example #11
def process(host, path, callback):
    path, workers = _build_workers(path)

    source_file = get_file(host + path)

    if get_extension(path) == 'svg':
        # http://redmine.pearbox.net/issues/1605
        source_image_type = 'svg+xml'
    else:
        image = get_image(source_file)
        exif = image.info.get('exif', b'')
        source_image_type = image.format.upper()

        for worker in workers:
            #extract palette from Image
            pl = image.getpalette()

            image = worker.do(image)

            if pl is not None:
                #if image has palette then restore it
                image.putpalette(pl)

        source_file = StringIO()
        image.save(source_file, source_image_type, exif=exif)

    data = source_file.getvalue()
    data_len = len(data)

    logging.info(
        'Image was successfully processed with type {type} and len {data_len}'.
        format(type=source_image_type, data_len=data_len))

    callback('200 OK',
             [('Content-type', 'image/{type}'.format(type=source_image_type)),
              ('Content-length', str(data_len))])

    return [data]
async def create_meme_tempfile(imgpath, text, text_top=None):
    ext = utils.get_extension(imgpath)
    temp = tempfile.NamedTemporaryFile(suffix=f".{ext}", delete=False)
    await create_meme(imgpath, temp.name, text, text_top)
    return temp.name
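Because create_meme_tempfile is a coroutine, a caller has to await it or drive it with an event loop. A minimal, hypothetical call site (the image path and captions are placeholders):

import asyncio

# Run the coroutine to completion and print the temporary file it produced.
meme_path = asyncio.run(create_meme_tempfile('image.jpg', 'bottom text', text_top='top text'))
print(meme_path)
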
Example #13
    def parse_json(self, data):

        ipdata = json.loads(data)

        try:

            if ipdata['imgs']:

                for n in ipdata['imgs']:  # each image entry in the data

                    if n['objURL']:

                        try:

                            proxy_support = urllib2.ProxyHandler(proxy)

                            opener = urllib2.build_opener(proxy_support)

                            urllib2.install_opener(opener)

                            #print "proxy",proxy

                            self.lock()

                            self.dbcurr.execute(
                                'select ID from pic_info where objURL=%s',
                                (n['objURL'],))

                            y = self.dbcurr.fetchone()

                            #print "y=",y

                            if y:

                                print "database exist"

                                self.unlock()  # unlock before continue

                                continue

                            else:

                                real_extension = utils.get_extension(
                                    n['objURL'])

                                req = urllib2.Request(n['objURL'],
                                                      headers=i_headers)

                                resp = urllib2.urlopen(req, None, 5)

                                dataimg = resp.read()

                                name = str(uuid.uuid1())

                                filename = ""

                                if len(real_extension) > 4:

                                    real_extension = ".gif"

                                real_extension = real_extension.lower()

                                if real_extension == ".gif":

                                    filename = self.makeDateFolder(
                                        "E://sosogif",
                                        "d" + str(self.count % 60)
                                    ) + "//" + name + "-www.sosogif.com-搜搜gif贡献" + real_extension

                                    self.count += 1

                                else:

                                    filename = self.makeDateFolder(
                                        "E://sosogif",
                                        "o" + str(self.count % 20)
                                    ) + "//" + name + "-www.sosogif.com-搜搜gif贡献" + real_extension

                                    self.count += 1
                                """

                                name=str(uuid.uuid1())

                                filename=""

                                if len(real_extension)>4:

                                    real_extension=".gif"

                                filename  =self.makeDateFolder("E://sosogif", "d"+str(self.count % 60))+"//"+name+"-www.sosogif.com-搜搜gif贡献"+real_extension

                                self.count+=1

                                """

                                try:

                                    if not os.path.exists(filename):

                                        file_object = open(filename, 'w+b')

                                        file_object.write(dataimg)

                                        file_object.close()

                                        self.anaylis_info(
                                            n, filename, real_extension)  # save record to the database

                                    else:

                                        print "file exist"

                                except IOError, e1:

                                    print "e1=", e1

                                    pass

                            self.unlock()

                        except IOError, e2:

                            #print "e2=",e2

                            pass

                            self.chance1 += 1
Example #14
def update_news(request, cursor, app, id):
    '''update_news(request, cursor, app, id):
    This function updates a news item, meaning that it can identify and 
    overwrite a specific row in the database. Basically the same as the 
    above for the text parts, but slightly different concerning the 
    management of pictures.
    NB: it DOESN'T DEAL with the ValueErrors. The caller is supposed to
    manage them.
    '''
    item = load_news(request)
    check_news(item)
    old_pics = retrieve_item("news", id, cursor)['pics']
    
    try:
        for n in xrange(len(item['photos'])):
            if old_pics[n][1] != item['photos'][n][0]:   # If labels are different, update them
                old_pics[n][1] = item['photos'][n][0]
            if item['photos'][n][1].filename != '':     # If I have a new file:
                # Save it
                filename = 'File{0}.{1}'.format(str(datetime.datetime.now()).translate(None, '.:- ')[:-3], get_extension(secure_filename(item['photos'][n][1].filename)))
                item['photos'][n][1].save(os.path.join(app.config['UPLOAD_FOLDER_PICS'], filename))
                # Delete the old one
                try:
                    os.remove(os.path.join(BASE_PATH, app.config['UPLOAD_FOLDER_PICS'], old_pics[n][2]))
                except OSError as e:
                    # If the file isn't there, I simply leave the corrupted one (if it exists) orphaned.
                    app.logger.error('OSError occurred in update_NEWS, probably orphan file. Error Code: {0}'.format(e))
                # Overwrite the name of the file in the database entry
                old_pics[n][2] = filename
    except IndexError:      # Means that I'm trying to update one more photo than what I have in old_pics (I'm adding a photo)
        filename = 'File{0}.{1}'.format(str(datetime.datetime.now()).translate(None, '.:- ')[:-3], get_extension(secure_filename(item['photos'][n][1].filename)))
        item['photos'][n][1].save(os.path.join(app.config['UPLOAD_FOLDER_PICS'], filename))
        old_pics.append( (n, item['photos'][n][0], filename) )
                   
    item['pics'] = old_pics
    cursor.execute("UPDATE news SET data=?, title=?, text=?, pics=? WHERE id = ?", [item['date'], item['title'], item['content'], json.dumps(item['pics']), id])

    return item
Example #15
def upload_doc(request, cursor, app):
    '''upload_doc(request, cursor, app):
    This function performs a fresh upload of all the data previously
    loaded and checked.
    It adds a new row in the database without overwriting anything.
    In case of failure, it returns all the unchecked raw loaded data,
    to be displayed again to the user so they can correct it.
    NB: it DOESN'T DEAL with the ValueErrors. The caller is supposed to
    manage them.
    '''
    item = load_doc(request)
    check_doc(item)
    
    filename = 'File{0}.{1}'.format(str(datetime.datetime.now()).translate(None, '.:- ')[:-3], get_extension(secure_filename(item['file'].filename)))
    item['file'].save(os.path.join(app.config['UPLOAD_FOLDER_DOCS'], filename))
    
    cursor.execute("INSERT INTO docs (name, path) VALUES (?, ?)", [item['title'], filename])
    return
Example #16
from datetime import datetime
from collections import defaultdict
from bokeh.plotting import figure
from bokeh.io import export_svgs
from multiprocessing import Pool


def date_to_month(d):
    return datetime(d.year, d.month, 1)


base_dir = "/home/visgean/Dropbox/**/*"

picture_extensions = ['.jpg', '.jpeg', '.png']
pictures = list(filter(
    lambda f: utils.get_extension(f) in picture_extensions,
    glob.iglob(base_dir, recursive=True)
))

with Pool(12) as p:
    exif_data = p.map(utils.get_exif, pictures)
    dates = p.map(utils.parse_date, exif_data)


filesize_counter = defaultdict(int)
image_counter = defaultdict(int)
for filename, date in dates:
    if not date:
        continue

    month = date_to_month(date)
Example #17
def update_doc(request, cursor, app, id):
    '''update_doc(request, cursor, app, id):
    This function updates a document, meaning that it can identify and 
    overwrite a specific row in the database. 
    NB #1: It doesn't call check_doc(), because the user may want to change
    only the label, without replacing the original file, and vice versa.
    NB #2: it doesn't deal with the ValueErrors. The caller is supposed to
    manage them.
    '''
    item = load_doc(request)
    
    if item['file']:
        old_file = retrieve_item('doc', id, cursor)
        
        # First of all I upload the new file
        filename = 'File{0}.{1}'.format(str(datetime.datetime.now()).translate(None, '.:- ')[:-3], get_extension(secure_filename(item['file'].filename)))
        item['file'].save(os.path.join(app.config['UPLOAD_FOLDER_DOCS'], filename))
        cursor.execute("UPDATE docs SET path=? WHERE id = ?", [json.dumps(filename), id])
        
        #Then I remove the old one
        try:
            os.remove(os.path.join(BASE_PATH, app.config['UPLOAD_FOLDER_DOCS'], old_file['path']))
        except OSError as e:
            # If the file isn't there, I simply leave the corrupted one (if it exists) orphaned.
            app.logger.error('OSError in update_DOC, probably orphan file. Error Code: {0}'.format(e))
        
    if item['title']:
        cursor.execute("UPDATE docs SET name=? WHERE id = ?", [item['title'], id])
    else:
        raise ValueError(u'''Impossibile caricare un documento senza titolo.<br>
                             Se il titolo non è stato caricato automaticamente, contatta il webmaster.''')
    return item
Example #18
def upload_news(request, cursor, app):
    '''upload_news(request, cursor, app):
    This function performs a fresh upload of all the material previously
    loaded and checked (except that unpaired labels, which check_news()
    tolerates for updates, are not allowed here and are checked again below).
    It adds a new row in the database without overwriting anything.
    In case of failure, it returns all the unchecked raw loaded data,
    to be displayed again to the user so they can correct it.
    NB: it DOESN'T DEAL with the ValueErrors. The caller is supposed to
    manage them.
    '''
    item = load_news(request)
    check_news(item)
    
    # Validation
    for l in item['photos']:
        if (not re.match('^image/[A-Za-z]*', l[1].mimetype)) and l[0] is not None:
            raise ValueError( u"Impossibile caricare una descrizione senza la relativa foto." )

    paths, labels = [], []
    for f in item['photos']:
        filename = 'File{0}.{1}'.format(str(datetime.datetime.now()).translate(None, '.:- ')[:-3], get_extension(secure_filename(f[1].filename)))
        f[1].save(os.path.join(app.config['UPLOAD_FOLDER_PICS'], filename))
        paths.append(filename)
        labels.append(f[0])

    pics = zip(xrange(len(paths)), labels, paths)
    cursor.execute("INSERT INTO news (data, title, text, pics) VALUES (?, ?, ?, ?)", [item['date'], item['title'], item['content'], json.dumps(pics)])
    return
Example #19
def main():
    # Parse arguments from command line
    parser = argparse.ArgumentParser(
        description='Anonymize a dataset using Mondrian in Spark.')
    parser.add_argument('METADATA', help='json file that describes the job.')
    parser.add_argument('WORKERS',
                        default=4,
                        type=int,
                        help='Number of initial cuts (workers)')
    parser.add_argument('DEMO',
                        default=0,
                        type=int,
                        help='Start tool in demo mode')
    parser.add_argument('TEST',
                        default=0,
                        type=int,
                        help='Start tool in test mode')

    args = parser.parse_args()
    demo = args.DEMO
    test = args.TEST

    start_time = time.time()

    with open(args.METADATA) as fp:
        job = json.load(fp)

    # Create Spark Session
    spark = SparkSession \
        .builder \
        .appName('mondrian') \
        .getOrCreate()

    spark.sparkContext.setLogLevel("WARN")

    # Enable Arrow-based columnar data transfers
    spark.conf.set('spark.sql.execution.arrow.pyspark.enabled', 'true')

    if demo == 1:
        print("\n[*] Spark context initialized")
        print("\tWait for 10 seconds to continue demo...")
        time.sleep(10)

    # Parameters
    filename_in = job['input']
    filename_out = job['output']
    # when repartition is not given it defaults to repartitionByRange
    if 'repartition' in job and \
        job['repartition'] in {'customRepartition',
                               'repartitionByRange',
                               'noRepartition'}:
        repartition = job['repartition']
    else:
        repartition = 'repartitionByRange'
    id_columns = job.get('id_columns', [])
    redact = job.get('redact', False)
    quasiid_columns = job['quasiid_columns']
    sensitive_columns = job.get('sensitive_columns')
    # when column score is not given it defaults to span
    score_functions = {
        'span': span,
        'entropy': entropy,
        'neg_entropy': neg_entropy
    }
    if 'column_score' in job and job['column_score'] in score_functions:
        column_score = score_functions[job['column_score']]
    else:
        column_score = span
    fragments = min(args.WORKERS, job.get('max_fragments', 10**6))
    K = job.get('K')
    L = job.get('L')
    measures = job.get('measures', [])

    # Setup mondrian_fragmentation function
    mondrian = functools.partial(mondrian_fragmentation,
                                 sensitive_columns=sensitive_columns,
                                 is_valid=get_validation_function(K, L))

    # when fraction is not given it defaults to None
    if 'fraction' in job and 0 < job['fraction'] < 1:
        fraction = job['fraction']
    else:
        fraction = None

    # when fragmentation is not given it defaults to quantile_fragmentation
    fragmentation_functions = {
        'mondrian': mondrian,
        'quantile': quantile_fragmentation
    }
    if 'fragmentation' in job and \
            job['fragmentation'] in fragmentation_functions:
        fragmentation = fragmentation_functions[job['fragmentation']]
    else:
        fragmentation = quantile_fragmentation

    if not K and not L:
        raise Exception("Both K and L parameters not given or equal to zero.")
    if L and not sensitive_columns:
        raise Exception(
            "l-diversity needs to know which columns are sensitive.")

    if fraction and fragmentation == mondrian:
        sys.exit('''Sorry, currently the mondrian fragmentation criterion is only
         available without sampling.''')

    if demo == 1:
        print("\n[*] Job details initialized")
        print("\tWait for 10 seconds to continue demo...")
        time.sleep(10)

    print('\n[*] Using {} initial partitions\n'.format(fragments))

    # Read file according to extension
    print('[*] Reading from {}\n'.format(filename_in))
    extension = get_extension(filename_in)
    df = spark.read \
        .options(header='true', inferSchema='true') \
        .format(extension).load(filename_in)

    if fraction:
        df = df.sample(fraction=fraction)
    pdf = df.toPandas()
    pdf.info()

    print('\n[*] Fragmentation details\n')
    """
    TODO: Avoid having a single node performing this step for the whole dataset
    """
    if not fraction:
        # Create first cut
        pdf = create_fragments(df=pdf,
                               quasiid_columns=quasiid_columns,
                               column_score=column_score,
                               fragments=fragments,
                               colname='fragment',
                               criteria=fragmentation)

        # Check first cut
        sizes = pdf.groupby('fragment').size()
        print("\n[*] Dataset distribution among fragments\n")
        print(sizes)

        print("\n[*] Dataset with fragmentation info\n")
        print(pdf.head())

        # Compute the range on the quasi-identifiers columns
        # will be useful for information loss evaluation
        quasiid_range = [-1] * len(quasiid_columns)
        for i, column in enumerate(quasiid_columns):
            quasiid_range[i] = span(pdf[column])

        # Recreate the dataframe in a way that is appreciated by pyarrow.
        pdf = pd.DataFrame.from_dict(pdf.to_dict())

        # Create spark dataframe
        df = spark.createDataFrame(pdf)
    else:
        # Compute quantiles on the sample
        column, bins = get_fragments_quantiles(df=pdf,
                                               quasiid_columns=quasiid_columns,
                                               column_score=column_score,
                                               fragments=fragments)

        # Read entire file in distributed manner
        df = spark.read \
            .options(header='true', inferSchema='true').csv(filename_in)
        bins[0] = float(
            "-inf")  # to prevent out of Bucketizer bounds exception
        bins[-1] = float(
            "inf")  # to prevent out of Bucketizer bounds exception

        if len(bins) != 2:
            # split into buckets only if there are more than 1
            bucketizer = Bucketizer(splits=bins,
                                    inputCol=column,
                                    outputCol='fragment')
            df = bucketizer.transform(df)
        else:
            # otherwise assign every row to bucket 0
            df = df.withColumn('fragment', F.lit(0.0))

        # Check first cut
        sizes = df.groupBy('fragment').count()
        print("\n[*] Dataset distribution among fragments\n")
        sizes.show()

        print("\n[*] Dataset with fragmentation info\n")
        df.show()

        # Compute the range on the quasi-identifiers columns
        # will be useful for information loss evaluation
        categoricals = [
            item[0] for item in df.dtypes
            if item[0] in quasiid_columns and item[1].startswith('string')
        ]
        funcs = (F.countDistinct(F.col(cname)) if cname in categoricals else
                 F.max(F.col(cname)) - F.min(F.col(cname))
                 for cname in quasiid_columns)
        quasiid_range = df.agg(*funcs).collect()[0]

    # Create a schema in which identifiers are either not there or strings
    # and quasi identifiers are strings.
    # This is needed because the result of the UDF has to generalize them.
    if not redact:
        schema = T.StructType(
            df.select([
                column for column in df.columns if column not in id_columns
            ]).schema)
    else:
        schema = T.StructType(df.schema)
        for column in id_columns:
            schema[column].dataType = T.StringType()
    for column in quasiid_columns:
        schema[column].dataType = T.StringType()

    # TODO: add a column to the output schema to keep information on the
    #       equivalent classes to avoid reconstructing them from scratch
    #       in the evaluation of the metrics

    if demo == 1 and fragments > 1:
        print("\n[*] Dataset fragmented")
        print("\tWait for 10 seconds to continue demo...")
        time.sleep(10)

    # initialize taxonomies
    quasiid_gnrlz = __generalization_preproc(job, df, spark=spark)

    if demo == 1 and quasiid_gnrlz:
        print("\n[*] Taxonomies data preprocessed")
        print("\tWait for 10 seconds to continue demo...")
        time.sleep(10)

    # Create the pandas udf
    @F.pandas_udf(schema, F.PandasUDFType.GROUPED_MAP)
    def anonymize_udf(pdf):
        adf = anonymize(df=pdf,
                        id_columns=id_columns,
                        redact=redact,
                        quasiid_columns=quasiid_columns,
                        sensitive_columns=sensitive_columns,
                        column_score=column_score,
                        K=K,
                        L=L,
                        quasiid_gnrlz=quasiid_gnrlz)

        # Ensure that the quasi identifier columns have been converted
        # to strings (they are required by the return type).
        for column in quasiid_columns:
            adf[column] = adf[column].astype('object')

        return adf

    if repartition == 'repartitionByRange':
        df = df.repartitionByRange('fragment')
    elif repartition == 'customRepartition':
        df = repartition_dataframe(df, spark)

    print('\n[*] Starting anonymizing the dataframe\n')
    print('Number of DF partitions: {}'.format(df.rdd.getNumPartitions()))
    ''' Debug spark partitioning -> Low performance
    count = 0
    for elem in df.rdd.glom().collect():
       print("Size of Spark Partition {}: {}".format(count, len(elem)))
       count +=1
    '''

    adf = df \
        .groupby('fragment') \
        .applyInPandas(anonymize_udf.func, schema=anonymize_udf.returnType) \
        .cache()

    # Create Discernability Penalty udf
    schema = T.StructType(
        [T.StructField('information_loss', T.LongType(), nullable=False)])

    @F.pandas_udf(schema, F.PandasUDFType.GROUPED_MAP)
    def discernability_penalty_udf(adf):
        dp = discernability_penalty(adf=adf, quasiid_columns=quasiid_columns)
        # pandas_udf requires a pandas dataframe as output
        return pd.DataFrame({'information_loss': [dp]})

    # Create Normalized Certainty Penalty udf
    schema = T.StructType(
        [T.StructField('information_loss', T.DoubleType(), nullable=False)])

    @F.pandas_udf(schema, F.PandasUDFType.GROUPED_MAP)
    def normalized_certainty_penalty_udf(adf):
        gcp = normalized_certainty_penalty(adf=adf,
                                           quasiid_columns=quasiid_columns,
                                           quasiid_range=quasiid_range,
                                           quasiid_gnrlz=quasiid_gnrlz)
        # pandas_udf requires a pandas dataframe as output
        return pd.DataFrame({'information_loss': [gcp]})

    if repartition == 'repartitionByRange':
        adf = adf.repartitionByRange('fragment')
    elif repartition == 'customRepartition':
        adf = repartition_dataframe(adf, spark)

    print('Number of ADF partitions: {}'.format(adf.rdd.getNumPartitions()))
    adf.drop('fragment').show(10)

    print('\n[*] Anonymized dataframe')

    if demo == 1:
        print("\tWait for 10 seconds to continue demo...\n")
        time.sleep(10)

    # dictionary to store test params
    measures_log = {}
    measures_log["fragments"] = fragments
    measures_log["repartition"] = repartition
    measures_log["K"] = K
    measures_log["L"] = L
    measures_log["fraction"] = fraction

    if measures:
        print('[*] Information loss evaluation\n')

    for measure in measures:
        if measure == 'discernability_penalty':
            dp = evaluate_information_loss(adf, discernability_penalty_udf)
            print(f"Discernability Penalty = {dp:.2E}")
            measures_log["DP"] = dp
        elif measure == 'normalized_certainty_penalty':
            ncp = evaluate_information_loss(adf,
                                            normalized_certainty_penalty_udf)
            print(f"Normalized Certainty Penalty = {ncp:.2E}")
            measures_log["NCP"] = ncp
        elif measure == 'global_certainty_penalty':
            gcp = evaluate_information_loss(adf,
                                            normalized_certainty_penalty_udf)
            gcp /= (len(quasiid_columns) * adf.count())
            print(f"Global Certainty Penalty = {gcp:.4f}")
            measures_log["GCP"] = gcp

    # Remove fragmentation information
    adf = adf.drop('fragment')

    # Write file according to extension
    print(f"\n[*] Writing to {filename_out}\n")
    extension = get_extension(filename_out)
    adf.write \
        .mode("overwrite") \
        .options(header=True) \
        .format(extension) \
        .save(filename_out)

    end_time = time.time()
    execution_time = end_time - start_time
    measures_log["timestamp"] = end_time
    measures_log["time"] = execution_time

    if test == 1:
        # Write test params to Hadoop
        test_result_files = [
            "hdfs://namenode:8020/anonymized/test_results.csv",
            "hdfs://namenode:8020/anonymized/artifact_result.csv"
        ]
        print("[*] Creating test configuration file on Hadoop")
        write_test_params(spark, measures_log, test_result_files)

    if demo == 0:
        print("--- %s seconds ---" % (execution_time))

    spark.stop()
    print('\n[*] Done\n')
Example #20
    def startSpiderWap(self):
        if self.spider_queue.empty():
            fetched_users = self.db.execute(
                'SELECT * from spider_list ORDER BY weight DESC limit 0,30')
            if fetched_users <= 0:
                print 'nothing to spider,spider_list is empty'
                return False
            self.start = 'start'
            self.errno = ERR_NO
            fetchall = self.db.fetchall()
            # Add the pending sharers fetched from the database to the crawl queue
            for item in fetchall:
                self.spider_queue.put({
                    'sid': item[0],
                    'uk': item[1],
                    'file_fetched': item[2],
                    'follow_fetched': item[3],
                    'follow_done': item[4],
                    'file_done': item[5],
                    'weight': item[6],
                    'uid': item[7]
                })
            self.got_follow_count = 0
            self.got_files_count = 0
            self.while_count = 0

        while not self.spider_queue.empty():
            self.while_count += 1
            share_user = self.spider_queue.get()
            # Crawl this sharer's file list
            if not share_user['file_done']:
                print '%d now spidering file ,%d  file fetched' % (
                    share_user['uk'], share_user['file_fetched'])
                rs = self.getShareListsWap(share_user['uk'],
                                           share_user['file_fetched'])
                if not rs:
                    print 'uk:%d error to fetch files,try again later...' % share_user[
                        'uk']
                    return True
                total_count, fetched_count, file_list = rs
                total_fetched = share_user['file_fetched'] + fetched_count
                print 'fetched_file_count:%d' % fetched_count
                if total_fetched >= total_count or total_count == 0:
                    share_user['file_done'] = 1  # all of this sharer's files have been crawled
                if total_count == 0:
                    self.db.execute(
                        "UPDATE spider_list set file_done=%s WHERE sid=%s",
                        (1, share_user['sid']))
                    self.db.commit()
                else:
                    try:
                        files_count = 0
                        for file in file_list:
                            files_count += 1
                            ext = ''
                            file_type = ''
                            file_type_i = -1
                            if file['isdir'] == 0 and file[
                                    'feed_type'] == 'share':
                                ext = utils.get_extension(
                                    file['title']).lower()
                                file_type = utils.get_category(ext)
                                file_type_i = self.file_type_t[file_type]
                            time_stamp = int(time.time())
                            self.db.execute(
                                "INSERT INTO share_file (title,uk,shareid,shorturl,isdir,size,md5,ext,feed_time,create_time,file_type,uid,feed_type) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                                (file['title'], file['uk'], file['shareid'],
                                 file['shorturl'], file['isdir'], file['size'],
                                 file['md5'], ext, file['feed_time'],
                                 time_stamp, file_type_i, share_user['uid'],
                                 file['feed_type']))
                    except:
                        share_user['file_done'] = 0
                        self.db.rollback()
                        traceback.print_exc()
                        return False
                    else:
                        self.db.execute(
                            "UPDATE spider_list set file_fetched=%s,file_done=%s WHERE sid=%s",
                            (total_fetched, share_user['file_done'],
                             share_user['sid']))
                        self.db.execute(
                            "UPDATE share_users set fetched=%s WHERE uid=%s",
                            (total_fetched, share_user['uid']))
                        share_user['file_fetched'] = total_fetched
                        self.got_files_count += files_count
                        self.db.commit()

            # After the files are done, crawl the follow list; skipped for WAP for now
            if share_user['follow_done'] == 0 and share_user['file_done'] == 1:
                share_user['follow_done'] = 1
                print 'deleting user:%d' % share_user['sid']
                self.db.execute("DELETE FROM spider_list WHERE sid=%s",
                                (share_user['sid'], ))
                self.db.commit()
            time.sleep(SPIDER_INTERVAL)

        print '-----------------Done------------------'
        print 'while_count:%d' % self.while_count
        print 'got_follow_count:%d' % self.got_follow_count
        print 'got_files_count:%d' % self.got_files_count
        return True
            print("Directory " + folderName + " already exists")

        if not os.path.exists(fileFolderName):
            os.mkdir(fileFolderName)
            print("Directory " + fileFolderName + " Created ")
        else:
            print("Directory " + fileFolderName + " already exists")

        i = 0
        for doc_id in doc_id_list:
            url_final = url_descarga_1 + doc_id + url_descarga_2
            file_path = os.path.join(
                fileFolderName, date_name + "&" + number_list[i] + "&" + entry_number_list[i])
            print("\n" + file_path)

            file_extension = utils.get_extension(url_final)
            file_complete_path = file_path + "." + file_extension

            if os.path.exists(file_complete_path):
                os.remove(file_complete_path)

            file_name = wget.download(url_final, file_complete_path)
            if (file_extension != "pdf"):
                utils.convert_to_pdf(file_name)
            i = i + 1

input("\nPress any key to close")



Example #22
def main():
    parser = argparse.ArgumentParser(description='Generate average atlas for an image folder.')
    parser.add_argument('--in_folder', type=str,
                        help='The input image folder')
    parser.add_argument('--out_union', type=str,
                        help='The output image path (with .nii.gz)')
    parser.add_argument('--out_inter', type=str,
                        help='The output image path (with .nii.gz)', default='')
    parser.add_argument('--ref', type=str,
                        help='Path of reference image. Define the affine and header of output nii.gz')
    parser.add_argument('--num_processes', type=int, default=10)

    args = parser.parse_args()
    file_list_all = os.listdir(args.in_folder)
    print('Process images under folder: ', args.in_folder)
    print('Number of files in folder %s is %d' % (args.in_folder, len(file_list_all)))
    nifti_file_list = [file_path for file_path in file_list_all if get_extension(file_path) == '.gz']
    print('Number of nii.gz files: ', len(nifti_file_list))

    file_name_chunks = get_chunks_list(nifti_file_list, args.num_processes)

    pool = Pool(processes=args.num_processes)

    # Get the shape.
    # im_temp = nib.load(os.path.join(args.in_folder, nifti_file_list[0]))
    im_temp = nib.load(args.ref)
    im_header = im_temp.header
    im_affine = im_temp.affine
    im_temp_data = im_temp.get_data()
    im_shape = im_temp_data.shape

    averaged_image_union = np.zeros(im_shape)
    averaged_image_inter = np.zeros(im_shape)
    averaged_image_union.fill(np.nan)
    # averaged_image_inter.fill(np.nan)
    non_null_mask_count_image = np.zeros(im_shape)

    if args.out_inter != '':
        print('Average in intersection:')
        image_average_inter_result_list = [pool.apply_async(sum_images_inter,
                                                            (file_name_chunk, args.in_folder))
                                           for file_name_chunk in file_name_chunks]

        for thread_idx in range(len(image_average_inter_result_list)):
            result = image_average_inter_result_list[thread_idx]
            result.wait()
            print(f'Thread with idx {thread_idx} / {len(image_average_inter_result_list)} is completed')
            print('Adding to averaged_image...')
            averaged_image_chunk = result.get()
            averaged_image_inter = add_image_inter(averaged_image_inter, averaged_image_chunk)
            print('Done.')

        averaged_image_inter = np.divide(averaged_image_inter,
                                         len(nifti_file_list),
                                         out=averaged_image_inter,
                                         where=np.logical_not(np.isnan(averaged_image_inter)))
        average_image_inter_obj = nib.Nifti1Image(averaged_image_inter, affine=im_affine, header=im_header)
        print(f'Saving to {args.out_inter}')
        nib.save(average_image_inter_obj, args.out_inter)
        print('Done.')
        print('')

    print('Average in union')
    image_average_union_result_list = [pool.apply_async(sum_images_union,
                                                        (file_name_chunk, args.in_folder))
                                       for file_name_chunk in file_name_chunks]

    for thread_idx in range(len(image_average_union_result_list)):
        result = image_average_union_result_list[thread_idx]
        result.wait()
        print(f'Thread with idx {thread_idx} / {len(image_average_union_result_list)} is completed')
        print('Adding to averaged_image...')
        averaged_image_chunk = result.get()
        averaged_image_union = add_image_union(averaged_image_union, averaged_image_chunk)
        print('Done.')

    non_null_mask_count_result = [pool.apply_async(sum_non_null_count,
                                                   (file_name_chunk, args.in_folder))
                                  for file_name_chunk in file_name_chunks]

    for thread_idx in range(len(non_null_mask_count_result)):
        result = non_null_mask_count_result[thread_idx]
        result.wait()
        print(f'Thread with idx {thread_idx} / {len(non_null_mask_count_result)} is completed')
        print('Adding to averaged_image...')
        averaged_image_chunk = result.get()
        non_null_mask_count_image = np.add(non_null_mask_count_image, averaged_image_chunk)
        print('Done.')

    averaged_image_union = np.divide(averaged_image_union,
                                     non_null_mask_count_image,
                                     out=averaged_image_union,
                                     where=non_null_mask_count_image>0)

    averaged_image_union_obj = nib.Nifti1Image(averaged_image_union, affine=im_affine, header=im_header)
    nib.save(averaged_image_union_obj, args.out_union)
    print('Done.')
def fetch_spacex_last_launch():
    images = get_spacex_last_launch_images()
    for image_number, image_url in enumerate(images):
        download_file(
            image_url, 'spacex{}.{}'.format(image_number,
                                            get_extension(image_url)))
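download_file itself is not shown on this page. A plausible sketch, assuming it simply fetches the URL with requests and writes the body to disk (the real helper may differ):

import requests

def download_file(url, filename):
    # Hypothetical implementation: download `url` and save the body to `filename`.
    response = requests.get(url)
    response.raise_for_status()
    with open(filename, 'wb') as image_file:
        image_file.write(response.content)
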
Example #24
    def startSpider(self):
        if self.spider_queue.empty():
            fetched_users = self.db.execute('SELECT * from spider_list ORDER BY weight DESC limit 0,20')
            if fetched_users <= 0:
                print('nothing to spider,spider_list is empty')
                return False
            self.start = 'start'
            self.errno = ERR_NO
            fetchall = self.db.fetchall()
            # Add the pending sharers fetched from the database to the crawl queue
            for item in fetchall:
                self.spider_queue.put({
                    'sid': item[0],
                    'uk': item[1],
                    'file_fetched': item[2],
                    'follow_fetched': item[3],
                    'follow_done': item[4],
                    'file_done': item[5],
                    'weight': item[6],
                    'uid': item[7]
                })
            self.got_follow_count = 0
            self.got_files_count = 0
            self.while_count = 0

        while not self.spider_queue.empty():
            self.while_count += 1
            share_user = self.spider_queue.get()
            # Crawl this sharer's file list
            if not share_user['file_done']:
                print('%d now spidering file ,%d  file fetched' % (share_user['uk'], share_user['file_fetched']))
                rs = self.getShareLists(share_user['uk'], share_user['file_fetched'])
                if not rs:
                    print('uk:%d error to fetch files,try again later...' % share_user['uk'])
                    return True
                total_count, fetched_count, file_list = rs
                total_fetched = share_user['file_fetched'] + fetched_count
                print('fetched_file_count:%d' % fetched_count)
                if total_fetched >= total_count or total_count == 0:
                    share_user['file_done'] = 1  # all of this sharer's files have been crawled
                if total_count == 0:
                    self.db.execute("UPDATE spider_list set file_done=%s WHERE sid=%s", (1, share_user['sid']))
                    self.db.commit()
                else:
                    try:
                        files_count = 0
                        for file in file_list:
                            files_count += 1
                            ext = ''
                            file_type = ''
                            file_type_i = -1
                            if file['isdir'] == 0 and file['feed_type'] == 'share':
                                ext = utils.get_extension(file['title']).lower()
                                file_type = utils.get_category(ext)
                                file_type_i = self.file_type_t[file_type]
                            time_stamp = int(time.time())
                            self.db.execute(
                                    "INSERT INTO share_file (title,uk,shareid,shorturl,isdir,size,md5,ext,feed_time,create_time,file_type,uid,feed_type) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                                    (file['title'], file['uk'], file['shareid'],
                                     file['shorturl'], file['isdir'], file['size'], file['md5'], ext, file['feed_time'],
                                     time_stamp, file_type_i, share_user['uid'], file['feed_type'])
                            )
                    except:
                        share_user['file_done'] = 0
                        self.db.rollback()
                        traceback.print_exc()
                        return False
                    else:
                        self.db.execute("UPDATE spider_list set file_fetched=%s,file_done=%s WHERE sid=%s",
                                        (total_fetched, share_user['file_done'], share_user['sid']))
                        self.db.execute("UPDATE share_users set fetched=%s WHERE uid=%s",
                                        (total_fetched, share_user['uid']))
                        share_user['file_fetched'] = total_fetched
                        self.got_files_count += files_count
                        self.db.commit()

            # After the files are done, crawl the follow list
            if share_user['follow_done'] == 0 and share_user['file_done'] == 1:
                print('%d now spidering follow ,%d  follow fetched' % (share_user['uk'], share_user['follow_fetched']))
                rs = self.getFollows(share_user['uk'], share_user['follow_fetched'])
                if not rs:
                    print('error to fetch follows,try again later...')
                    return
                total_count, fetched_count, follow_list = rs
                total_fetched = share_user['follow_fetched'] + fetched_count
                print('fetched_follow_count:%d' % fetched_count)
                if total_fetched >= total_count or total_count == 0:
                    share_user['follow_done'] = 1
                if total_count == 0:
                    self.db.execute("DELETE FROM spider_list WHERE sid=%s", (share_user['sid'],))
                    self.db.commit()
                else:
                    try:
                        follow_count = 0
                        for follow in follow_list:
                            follow_count += 1
                            # Check whether this user is already in the share_users table
                            if self.db.execute('SELECT * FROM share_users WHERE uk=%s', (follow['follow_uk'],)) > 0:
                                print('uk:%d is already in share_user table' % follow['follow_uk'])
                                continue
                            time_stamp = int(time.time())
                            self.db.execute("INSERT INTO share_users (uk,user_name,avatar_url,intro,follow_count,album_count,\
                                fens_count,pubshare_count,last_visited,create_time,weight) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                                            (
                                                follow['follow_uk'], follow['follow_uname'], follow['avatar_url'],
                                                follow['intro'], follow['follow_count'],
                                                follow['album_count'], follow['fans_count'], follow['pubshare_count'],
                                                time_stamp, time_stamp, 5
                                            )
                                            )
                            # Add the newly fetched sharer to the crawl list
                            self.db.execute("INSERT INTO spider_list (uk,uid) VALUES(%s,%s)",
                                            (follow['follow_uk'], self.db.last_row_id()))
                    except:
                        share_user['follow_done'] = 0
                        self.db.rollback()
                        traceback.print_exc()
                        return False
                    else:
                        if share_user['follow_done'] == 1:
                            # Follows fully crawled: this sharer is done, remove it from the pending list
                            print('delete follow fetched sid:%d from spider_list' % share_user['sid'])
                            self.db.execute("DELETE FROM spider_list WHERE sid=%s", (share_user['sid'],))
                        else:
                            self.db.execute("UPDATE spider_list set follow_fetched=%s,follow_done=%s WHERE sid=%s",
                                            (total_fetched, share_user['follow_done'], share_user['sid']))
                        share_user['follow_fetched'] = total_fetched
                        self.got_follow_count += follow_count
                        self.db.commit()
            # If the follow list is not finished, this sharer is not fully crawled yet; put it back on the work queue and keep crawling
            if share_user['follow_done'] == 0:
                self.spider_queue.put(share_user)
            else:
                print('%d has done' % share_user['uk'])
                del share_user
            time.sleep(SPIDER_INTERVAL)

        print('-----------------Done------------------')
        print('while_count:%d' % self.while_count)
        print('got_follow_count:%d' % self.got_follow_count)
        print('got_files_count:%d' % self.got_files_count)
        return True
np.save(without_extension(_file) + '.npy', encoding)
print("(batch_size, time_steps, dimensions) :", encoding.shape)

# plotting #
if PLOT:
    fig, axs = plt.subplots(2, 1, figsize=(10, 5))
    axs[0].plot(audio)
    axs[0].set_title('Audio Signal')
    axs[1].plot(encoding[0])
    axs[1].set_title('NSynth Encoding')

# decoding #
'''Synthesizes audio from the encoding and saves it'''
fastgen.synthesize(
    encoding,
    save_paths=[without_extension(_file) + "_decoded." + get_extension(_file)],
    samples_per_save=sample_length)

if DEBUG:
    print("Generation for normal encoding achieved !")

# slower and faster encoding #
encoding_slower = timestretch(encoding, 1.5)
encoding_faster = timestretch(encoding, 0.5)

if PLOT:
    fig, axs = plt.subplots(3, 1, figsize=(10, 7), sharex=True, sharey=True)
    axs[0].plot(encoding[0])
    axs[0].set_title('Encoding (Normal Speed)')
    axs[1].plot(encoding_faster[0])
    axs[1].set_title('Encoding (Faster)')