def main():
    total_timer = Timer()
    iteration_timer = Timer()
    short_timer = Timer()
    total_timer.start()
    logger.success(' --- Welcome to the Kwantum Transmission Device Inspector --- ')

    parser = Parser()
    pool = Pool(nodes=parser.config['n_cpus'])
    logger.info('Running calculations with ' + str(parser.config['n_cpus']) + ' workers.')

    ga = GA(parser, objective_function=objectiveFunction)
    structures = ga.generator.generateAll(
        pool=pool,
        seeds=np.random.randint(0, 2**32 - 1, parser.config['GA']['n_structures']))
    s = structures[0]
    s.visualizeSystem(args={'dpi': 600, 'file': 'system.png'})

    # fig, axes = plt.subplots(3, 2, figsize=(10, 15))
    import matplotlib.gridspec as gridspec
    fig = plt.figure(figsize=(40, 10))
    outer = gridspec.GridSpec(2, 1)
    top = gridspec.GridSpecFromSubplotSpec(1, 5, subplot_spec=outer[0], wspace=0.2, hspace=0.2)

    bs_axis = plt.Subplot(fig, top[0])
    ms, bs = s.getBandStructure(0)
    bs_axis.plot(ms, bs, c='k')
    bs_axis.set_xlabel('Wavenumber [\AA${}^{-1}$]')
    bs_axis.set_ylabel('Energy [eV]')
    fig.add_subplot(bs_axis)

    cs_axis = plt.Subplot(fig, top[1])
    es, cs = s.getConductance(0, 1)
    cs_axis.plot(es, cs, c='k')
    cs_axis.set_ylabel('Transmission Function')
    cs_axis.set_xlabel('Energy [eV]')
    fig.add_subplot(cs_axis)

    dos_axis = plt.Subplot(fig, top[2])
    es, ds = s.getDOS()
    dos_axis.plot(es, ds / np.sum(ds), c='k')
    dos_axis.set_ylabel('Density of States')
    dos_axis.set_xlabel('Energy [eV]')
    dos_axis.set_ylim([0.0, 0.1 * np.max(ds / np.sum(ds))])
    fig.add_subplot(dos_axis)

    vcs_axis = plt.Subplot(fig, top[3])
    cvs = [s.getValleyPolarizedConductance(energy, 0, 1) for energy in es]
    cvs = np.array(cvs)
    vcs_axis.plot(es, cvs[:, 0], 'k', label='$k\'$')
    vcs_axis.plot(es, cvs[:, 1], 'k--', label='$k$')
    vcs_axis.set_ylabel('Transmission Function')
    vcs_axis.set_xlabel('Energy [eV]')
    # vcs_axis.set_xlim([-0.5, 0.5])
    vcs_axis.legend()
    fig.add_subplot(vcs_axis)

    crs_axis = plt.Subplot(fig, top[4])
    biases = np.linspace(0.05, 0.5, 64)
    threeK = 0.00025851991  # k_B * T in eV for T = 3 K
    currents_3 = pool.map(s.getCurrent, [0] * biases.shape[0], [1] * biases.shape[0],
                          biases, [threeK] * biases.shape[0])
    vcs_3 = pool.map(s.getValleyPolarizedCurrent, [0] * biases.shape[0], [1] * biases.shape[0],
                     biases, [threeK] * biases.shape[0])
    vcs_3 = np.array(vcs_3)
    # scale k_B*T by 10 and 100 so the total currents match the 30 K and 300 K labels below
    currents_30 = pool.map(s.getCurrent, [0] * biases.shape[0], [1] * biases.shape[0],
                           biases, [threeK * 10] * biases.shape[0])
    vcs_30 = pool.map(s.getValleyPolarizedCurrent, [0] * biases.shape[0], [1] * biases.shape[0],
                      biases, [threeK * 10] * biases.shape[0])
    vcs_30 = np.array(vcs_30)
    currents_300 = pool.map(s.getCurrent, [0] * biases.shape[0], [1] * biases.shape[0],
                            biases, [threeK * 100] * biases.shape[0])
    vcs_300 = pool.map(s.getValleyPolarizedCurrent, [0] * biases.shape[0], [1] * biases.shape[0],
                       biases, [threeK * 100] * biases.shape[0])
    vcs_300 = np.array(vcs_300)
    crs_axis.semilogy(biases, currents_3, 'k', label='Total - 3 K')
    crs_axis.semilogy(biases, vcs_3[:, 0], 'r', label='$k\'$ - 3 K')
    crs_axis.semilogy(biases, vcs_3[:, 1], 'b', label='$k$ - 3 K')
    crs_axis.semilogy(biases, currents_30, 'k--', label='Total - 30 K')
    crs_axis.semilogy(biases, vcs_30[:, 0], 'r--', label='$k\'$ - 30 K')
    crs_axis.semilogy(biases, vcs_30[:, 1], 'b--', label='$k$ - 30 K')
    crs_axis.semilogy(biases, currents_300, 'k-.', label='Total - 300 K')
    crs_axis.semilogy(biases, vcs_300[:, 0], 'r-.', label='$k\'$ - 300 K')
    crs_axis.semilogy(biases, vcs_300[:, 1], 'b-.', label='$k$ - 300 K')
    crs_axis.set_xlabel('Bias [V]')
    crs_axis.set_ylabel('Current [$e / \pi \hbar$]')
    crs_axis.legend()
    fig.add_subplot(crs_axis)

    sys_axis = plt.Subplot(fig, outer[1])
    s.visualizeSystem(args={'ax': sys_axis})
    fig.add_subplot(sys_axis)

    for axis in fig.get_axes():
        axis.grid(linestyle='--', linewidth=0.5)
    plt.tight_layout()
    plt.savefig('inspection.pdf')
    plt.show()

    logger.success(' --- Elapsed time: %s ---' % (total_timer.stop()))
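# The block above leans on a pathos-style Pool (created with `nodes=...`) whose
# map() accepts several iterables and pairs them element-wise into each worker
# call, e.g. s.getCurrent(0, 1, biases[i], threeK). A minimal sketch of that
# pattern, assuming pathos is installed; `toy_current` and its arguments are
# hypothetical stand-ins, not the functions used above.
from pathos.multiprocessing import ProcessingPool as Pool
import numpy as np


def toy_current(lead_in, lead_out, bias, kT):
    # placeholder for an expensive per-bias transport calculation
    return (lead_out - lead_in) * bias * np.exp(-bias / kT)


if __name__ == '__main__':
    biases = np.linspace(0.05, 0.5, 8)
    n = biases.shape[0]
    pool = Pool(nodes=4)
    # one worker call per index i: toy_current(0, 1, biases[i], 0.026)
    currents = pool.map(toy_current, [0] * n, [1] * n, biases, [0.026] * n)
    print(np.array(currents))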
OpenDir = args.OpenDir
os.chdir(OpenDir)
au_path = os.path.join('all_aus', 'au_*.hdf')
if args.refresh:
    if not os.path.exists('all_aus'):
        os.mkdir('all_aus')
    PATIENT_DIRS = [
        x for x in glob.glob('*cropped') if 'hdfs' in os.listdir(x)
    ]
    dfs = []
    df = None
    patient_queue = multiprocessing.Manager().Queue()
    partial_patient_func = functools.partial(load_patient, patient_queue)
    with Pool() as p:
        max_ = len(PATIENT_DIRS)
        with tqdm(total=max_) as pbar:
            for i, _ in enumerate(
                    p.imap(partial_patient_func, PATIENT_DIRS[:max_], chunksize=100)):
                pbar.update()
    df = dd.concat(dump_queue(patient_queue), interleave_partitions=True)
    del patient_queue
    gc.collect()
    # print(dd.stack(dfs))
    # df = df.compute()
    df.to_hdf(au_path, '/data', format='table',
se.crosstalk(EXPFILE, '', **args)
nccd = len(CCD)
xfiles_list = glob.glob('*_xtalk.fits')
if len(xfiles_list) < nccd:
    print(" Possibly corrupted file: expected %d extensions but got only %d \n" % (
        nccd, len(xfiles_list)))
    sys.exit(-1)

# running pixelcorrect and bleedmask
se.link_from_Dcache(se.data_conf + 'default.psf')
instrings = []
for ccd in CCD:
    ccdstring = "%02d" % int(ccd)
    instrings.append(ccdstring)
pool = Pool(ncpu)
pars = [(se, ccdstring) for ccdstring in instrings]
pool.map(runL1P, pars)
se.fileclean('xtalk', '.fits')
se.fileclean('nullweight', '.fits')
se.combineFiles('D' + ("%08d" % int(EXPNUM)) + '**sextractor.fits',
                'Scamp_allCCD_r' + rRun + 'p' + pRun + '.fits')
try:
    se.sanityCheck('D' + ("%08d" % int(EXPNUM)) + '_' + FILTER + '_01' + '_r' + rRun +
                   'p' + pRun + '_sextractor.fits')
except ValueError as err:
    print(err.args)
    sys.exit(-1)
se.fileclean('bpm', '.fits')
se.fileclean('biascor', '.fits')
w = line["Word"] valenceList[w].append(line["V.Mean.Sum"]) arousalList[w].append(line["A.Mean.Sum"]) dominanceList[w].append(line["D.Mean.Sum"]) d = "@Ayerad no, well i hope not. He could ha hasnt been at school fer a wile but @koast08 doesnt believe he had cancer" a, b, c = generate_emotion_features(d) x_text, Y = data_helpers.load_data_and_y_labels("../data/MR/rt-polarity.pos", "../data/MR/rt-polarity.neg") valence_feature_list = [] arousal_feature_list = [] dominance_feature_list = [] p = Pool(4) for (valence_list, arousal_list, dominance_list) in p.map(generate_emotion_features, x_text): valence_feature_list.append(valence_list) arousal_feature_list.append(arousal_list) dominance_feature_list.append(dominance_list) valence_feature_list = numpy.expand_dims(valence_feature_list, axis=2) arousal_feature_list = numpy.expand_dims(arousal_feature_list, axis=2) dominance_feature_list = numpy.expand_dims(dominance_feature_list, axis=2) data_set = "MR" numpy.save("../dump/" + data_set + "/valence_feature_list", valence_feature_list) numpy.save("../dump/" + data_set + "/arousal_feature_list", arousal_feature_list)
        G = nk.nxadapter.nx2nk(G)
        G.removeSelfLoops()
        # o = nk.overview(G)
        measures = graph_measures(G)
        data.loc[t] = measures
        data.to_csv(outpath)
        print('Save to %s' % outpath)
    except:
        print('Skip %s' % s)
        pass


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir", default='./summaries/graphs', type=str)
    parser.add_argument("--out_dir", default='./summaries/baselines', type=str)
    args = parser.parse_args()

    files = os.listdir(args.data_dir)
    paths = []
    for f in files:
        inpath = os.path.join(args.data_dir, f)
        outpath = os.path.join(args.out_dir, f)
        paths.append((inpath, outpath))

    with Pool() as P:
        P.map(analyze_graphs, paths)
def main(args): def function_(year, args=args): extensions = ['hn1', 'hn3', 'hsr2000', 'hsr2500', 'hsrel', 'hstop'] language = ['de', 'en'] file_type = ['gif', 'png'] origin_paths = [] url = "https://www.slf.ch/fileadmin/user_upload/import/lwdarchiv/public" urls = [] y = year for ext in extensions: for lan in language: for f_type in file_type: origin = os.path.join(*[args.maps_directory,y, ext, lan, f_type])#,"*."+f_type]) if(Path(origin).exists()): origin_paths.append(os.path.join(*[origin, "*."+f_type])) urls.append("/".join([url, y, ext, lan, f_type])) for i, origin in enumerate(origin_paths): for file_map in glob.glob(origin): basename = os.path.basename(file_map) #print("name: " + basename) filename = '{}.json'.format(os.path.splitext(basename)[0]) destination = os.path.join(args.out_path, filename) file_url = urls[i]+"/"+basename #print("url: "+ file_url) if Path(destination).exists() and not args.f: print('Skip {} because {} already exists'.format(file_map, destination)) continue img = Image.open(file_map) img = img.convert('RGB') img_arr = np.array(img) # load mask of this size try: binary_mask, landmarks_pix = open_mask(*img_arr.shape[:2]) except FileNotFoundError: print('Missing mask "{}x{}.gif" for file "{}"'.format(*img_arr.shape[:2], file_map), file=sys.stderr) continue #remove grey colors nogrey_img_arr = remove_colors(img_arr, shades_grey) #build colormap color_map = build_color_map(nogrey_img_arr) #map image colors to registered shades new_img_arr = replace_color(nogrey_img_arr, color_map=color_map) # keep useful colors regions_only = keep_colors(new_img_arr, shades_blue) # clip the binary mask to remove color key regions_only[~binary_mask] = 255 regions_only = Image.fromarray(regions_only).convert('RGB') smoothed = regions_only.filter(ImageFilter.MedianFilter(7)) pix = np.array(list(map(numpify, landmarks_pix.values()))) coord = np.array(list(map(numpify, landmarks_pix.keys()))) # add 1 bias raw pix_ext = np.vstack([np.ones((1,pix.shape[0])), pix.T]) coord_ext = np.vstack([np.ones((1,pix.shape[0])), coord.T]) T = np.linalg.lstsq(pix_ext.T, coord_ext.T)[0] def transform_pix2map(points): """n x 2 array""" points_ext = np.hstack([np.ones((points.shape[0], 1)), points]) points_map = points_ext.dot(T) return points_map[:, 1:] geo_json = { "type": "FeatureCollection", "features": [] } for snow_level, color in enumerate(shades_blue): for contour in color_contours(smoothed, color): contour_right = contour.copy() contour_right[:,0] = contour[:,1] contour_right[:,1] = contour[:,0] contour_right = transform_pix2map(contour_right) simplifier = vw.Simplifier(contour_right) contour_right = simplifier.simplify(threshold=SMOOTHING_THRESHOLD) geo_json['features'].append({ "type": "Feature", "properties": { "date": ".".join([basename[6:8], basename[4:6], basename[0:4]]), "snow_level": snow_level_legend[int(snow_level)], "url": file_url }, "geometry": { "type": "Polygon", "coordinates": [ list(reversed(contour_right.tolist())) ] } }) with open(destination, 'w') as f: print('{} -> {}'.format(file_map, destination)) json.dump(geo_json, f) with Pool(8) as p: p.map(function_, [str(i) for i in range(2002, 2018)])
def main(): args = parser.parse_args() fastadir = str(Path(args.fastadir).absolute()) hmmdir = str(Path(args.hmmdir).absolute()) outdir = str(args.outdir) threshold = float(args.evalue) threads = int(args.threads) already_scanned = args.already_scanned no_seqs = args.no_seqs p = Pool(threads) # Make output directory if not os.path.exists(outdir): os.system('mkdir ' + outdir) outdir = str(Path(outdir).absolute()) else: outdir = str(Path(outdir).absolute()) # Get list of paths of all fastas fastalist_wpath = list( map(lambda file: os.path.join(fastadir, file), os.listdir(fastadir))) # Get list of all fastas fastalist = list( map(lambda file: file.split('.faa')[0], os.listdir(fastadir))) # Get list of paths of all HMM files hmmlist_wpath = list( map(lambda file: os.path.join(hmmdir, file), os.listdir(hmmdir))) # Get list of all HMMs hmmlist = list(map(lambda file: file.split('.hmm')[0], os.listdir(hmmdir))) hmm_outfiles = [] def get_fastaheader_id(fasta): for rec in SeqIO.parse(fasta, 'fasta'): if '.peg' in rec.id: id = rec.id.split('.peg')[0] elif '|' in rec.id: id = rec.id.split('|')[0] else: print('Unrecognized header found. Aborting.') sys.exit() break return id #Get list of fasta header IDs by mapping to get_fastaheader_id fn fasta_header_ids = list(map(get_fastaheader_id, fastalist_wpath)) #Make fasta dictionary (hopefully deprecated, let's see; dec 18 3:58 mountain time) fastadict = dict(zip(fasta_header_ids, fastalist)) # For each fasta, run all hmms if not already_scanned: for fastafile in fastalist_wpath: fastaoutdir = outdir + '/' + fastafile.split('/')[-1].split( '.faa')[0] # Make outdir for HMMs if not os.path.exists(fastaoutdir): os.system('mkdir ' + outdir + '/' + fastafile.split('/')[-1].split('.faa')[0]) #Make symbolic link os.system('ln -s ' + fastafile + ' ' + fastaoutdir + '/') hmm_outfiles.append([]) # Run all HMMs for fastafile hmm_outfiles[-1] = list(p.map(lambda hmmfile: run_hmmsearch(fastafile, hmmfile, outdir, threshold), \ hmmlist_wpath)) # Move all outfiles to corresponding output directory for outfile in hmm_outfiles[-1]: os.system('mv ' + outdir + '/' + outfile + ' ' + fastaoutdir) # Make directory to store fastas if not os.path.exists(outdir + '/' + 'fastas'): os.system('mkdir ' + outdir + '/' + 'fastas') # Make matrix of zeros to store hits hits_by_hmm = [] #Declare function to get hits for each HMM def extract_all_hits(fastaname, hmm): fastadir = outdir + '/' + fastaname #Get name of appropriate hmmfile, path hmmhits_for_fasta = list( filter(lambda x: hmm in x, os.listdir(fastadir))) hits = extract_hits_by_outfile(fastadir, hmmhits_for_fasta) return hits for hmm in hmmlist: print("Extracting hits for: ", hmm) relevant_outfiles = [] hits_by_hmm.append([ list( p.map(lambda fastaname: extract_all_hits(fastaname, hmm), fastalist)), hmm ]) print("Making hits matrix...") hitstable = np.zeros((len(hmmlist), len(fastalist))) # Mark hits in table for hmm_idx, hmm in enumerate(hits_by_hmm): for genome_idx, genome_hits in enumerate(hmm[0]): if type(genome_hits) is list: hits = len(genome_hits) elif type(genome_hits) is str: hits = 1 if genome_hits is None: hitstable[hmm_idx][genome_idx] = 0 else: hitstable[hmm_idx][genome_idx] = hits hits = pd.DataFrame(hitstable).T hits.columns = hmmlist hits['id'] = fastalist cols = list(hits.columns.values) cols.pop(cols.index('id')) hits = hits[['id'] + cols] hits.to_csv(outdir + '/HITSTABLE.tsv', sep='\t', index=False) if not no_seqs: hmms_written = list( p.map( lambda hits: get_recs_for_hits(hits[0], hits[ 1], fastadict, 
fastalist_wpath, fastalist, outdir), hits_by_hmm)) for hmm in hmmlist: if hmm not in hmms_written: print(hmm) sys.exit() # recs_by_hmm = list(map(lambda hits: get_recs_for_hits(hits), hits_by_hmm)) print('boogie')
def surfsearchlight(surf, datafile, func, radius=3, openmp=True, mp=4,\ outprefix=None, intent=2005, verbose=False, method='3dsphere'): ''' def surfsearchlight(surf, datafile, func, radius=3, openmp=True, mp=4,\ outprefix=None, intent=2005): Perform surface-based search light analysis. Input: <surf>: a string, or path object indicate a surface file either in freesurfer format or .gii format. We read in this surface file to obtain geometry of surface vertices <datafile>: can be (1), .gii, .gii.gz surface data file, it should be in .gii format and the data should be in nVert x M matrix, nVert is the number of vertex, M columns are data (2), nVert x M matric that internally can be directly used <func>: The function object to calculate, it takes in data file and generate output. Note that func either output a single value or output a tuple for multiple results <radius>: in mm (default: 3), radius to include vertex <openmp>: boolean, whether to use parallal computing, (default=True) <mp>: how many cores to open, default:20 <outPrefix>: a string, we save to a .gii file, if you want to save to .gii file ,you must supply a .gii file for <datafile> <intent>: an int or a list of ints, intent number for each column of result array. This is necessary when saving results into a .gii file. check savegifti.py for more info <method>: (1) '3dsphere' (default), including vertex within a 3dsphere, typically run on a sphere or inflated surface (2) 'geodensic', using geodensic distance, which is more accurate but take a long time currently, this method seems problematic, I would not recommand this Output: We save a .gii file with the output 20190413 RZ add <method> 20190412 RZ created the file ''' from numpy import ndarray, vstack, hstack, array, arange, where from RZutilpy.system import unix_wrapper, Path, gettimestr from RZutilpy.mri import savegifti from numpy import ndarray, vstack, array, arange from nibabel import load from nibabel.freesurfer.io import read_geometry from time import time from sklearn.neighbors import NearestNeighbors from pathos.multiprocessing import Pool from surfdist import surfdist # first read the surf file surf = Path(surf) if ~isinstance(surf, Path) else surf if surf.suffix == '.gii': # .gii format vtrx, faces = load(surf.str).darrays[0].data, load( surf.str).darrays[1].data else: # freesurfer format vtrx, faces = read_geometry(surf.str) # and read the data file if not isinstance(datafile, ndarray) and isinstance(datafile, str): datafile = Path(datafile) assert datafile.suffix == '.gii', 'data file should be .gii format!' giftiobj = load(datafile.str) data = [i.data for i in giftiobj.darrays] data = vstack(data).T # now data is nVert x M columns data file else: data = datafile del datafile # assert same number of vertices in surface and data assert data.shape[0] == vtrx.shape[ 0], 'surface file and data have different number of vertices!' nVtrx = data.shape[0] index = range(nVtrx) # calculate neighbour if method == '3dsphere': neigh = NearestNeighbors(radius=radius, metric='euclidean', n_jobs=mp) neigh.fit(vtrx) # in this case we first fit to the x, y, z nbrs = neigh.radius_neighbors(vtrx, return_distance=False) elif method == 'geodesic': # slow... 
do not recommand # using surfdist def calcneighbors(i): print(i) dist = surfdist.dist_calc((vtrx, faces), index, i) return where(dist <= radius)[0] with Pool(mp) as p: # use imap, the returned results are in order #b = p.imap(calcneighbors, index, chunksize=2000) b = p.imap(calcneighbors, range(2000), chunksize=2000) nbrs = list(b) # note that nbrs is a ndarray, each element is an array since each element might have different # Define the wrapper function def runsearchlight(i): # get the index of neighbors idx = nbrs[i] # get the data of neighbors data_i = data[idx, :] if verbose: print(i) return func(data_i) # do it tstr = gettimestr('full') with Pool(mp) as p: # use imap, the returned results are in order b = p.imap(runsearchlight, arange(vtrx.shape[0]), chunksize=2000) #b = p.imap(runsearchlight, arange(2000), chunksize=2000) data2save = list(b) print(f'searchlight starts from {tstr}') print(f'searchlight ends at {gettimestr("full")}') # let's take about 1d or 2d nCol = len(data2save[0]) if isinstance(data2save[0], tuple) else 1 data2save = array(data2save) if nCol == 1 else vstack(data2save) # save the file if outprefix: assert 'giftiobj' in locals( ), 'You must input a .gii file for data if you want to save result to .gii' savegifti(data2save, outprefix, giftiobj, intent) return data2save
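# A hedged usage sketch for surfsearchlight: `meanfunc` below just averages the
# data of each searchlight neighborhood and returns one value per column; the
# surface/data file names in the commented call are placeholders, not files
# shipped with this code.
import numpy as np


def meanfunc(data_i):
    # data_i is nNeighborVertices x M; reduce to one value per data column
    return tuple(np.mean(data_i, axis=0))


# result = surfsearchlight('lh.inflated', 'lh.mydata.gii', meanfunc,
#                          radius=3, mp=4, outprefix='lh.searchlight_mean')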
NUM_SURR = 1000
NUM_WORKERS = 20

net = ScaleSpecificNetwork('%sair.mon.mean.levels.nc' % path_to_data, 'air',
                           date(1948, 1, 1), date(2015, 1, 1), None, None, 0,
                           dataset="NCEP", sampling='monthly', anom=False)
pool = Pool(NUM_WORKERS)
net.wavelet(1, 'y', pool=pool, cut=1)
net.get_continuous_phase(pool=pool)
net.get_phase_fluctuations(rewrite=True, pool=pool)
pool.close()
pool.join()

nao = DataField()
raw = np.loadtxt("%sWeMO.monthly.1821-2013.txt" % (path_to_data))
raw = raw[:, 1:]
nao.data = raw.reshape(-1)
nao.create_time_array(date_from=date(1821, 1, 1), sampling='m')
nao.select_date(date(1949, 1, 1), date(2014, 1, 1))
nao.anomalise()
jfm_index = nao.select_months([1, 2, 3], apply_to_data=False)
if len(graph.clusters().subgraph(y).vs) < 500: break g = graph.clusters().subgraph(y) g_edges = edges.loc[edges.id.isin(g.es()['id'])] g_nodes = nodes.loc[nodes.id.isin(g.vs()['id'])] g_edges, g_nodes = reset_ids(g_edges,g_nodes) feather.write_dataframe(g_edges,"/scistor/ivm/data_catalogue/open_street_map/percolation_networks/"+x+"_"+str(counter)+"-edges.feather") feather.write_dataframe(g_nodes,"/scistor/ivm/data_catalogue/open_street_map/percolation_networks/"+x+"_"+str(counter)+"-nodes.feather") g_df = metrics(g) g_df.to_csv("/scistor/ivm/data_catalogue/open_street_map/percolation_metrics/"+x+"_"+str(counter)+"_metrics.csv") counter += 1 print(x+' has finished!') except Exception as e: print(x+" failed because of {}".format(e)) if __name__ == '__main__': #countries = ['ABW', 'AFG', 'AGO', 'AIA', 'ALA', 'ALB', 'AND', 'ARE', 'ARG', 'ARM', 'ASM', 'ATG', 'AUS', 'AUT', 'AZE', 'BDI', 'BEL', 'BEN', 'BES', 'BFA', 'BGD', 'BGR', 'BHR', 'BHS', 'BIH', 'BLM', 'BLR', 'BLZ', 'BMU', 'BOL', 'BRA', 'BRB', 'BRN', 'BTN', 'BWA', 'CAF', 'CAN', 'CCK', 'CHE', 'CHL', 'CIV', 'CMR', 'COD', 'COG', 'C*K', 'COL', 'COM', 'CPV', 'CRI', 'CUB', 'CUW', 'CXR', 'CYM', 'CYP', 'CZE', 'DJI', 'DMA', 'DNK', 'DOM', 'DZA', 'ECU', 'EGY', 'ERI', 'ESH', 'ESP', 'EST', 'ETH', 'FIN', 'FJI', 'FLK', 'FRA', 'FRO', 'FSM', 'GAB', 'GBR', 'GEO', 'GGY', 'GHA', 'GIB', 'GIN', 'GLP', 'GMB', 'GNB', 'GNQ', 'GRC', 'GRD', 'GRL', 'GTM', 'GUF', 'GUM', 'GUY', 'HKG', 'HND', 'HRV', 'HTI', 'HUN', 'IDN', 'IMN', 'IND', 'IRL', 'IRN', 'IRQ', 'ISL', 'ISR', 'ITA', 'JAM', 'JEY', 'JOR', 'JPN', 'KAZ', 'KEN', 'KGZ', 'KHM', 'KIR', 'KNA', 'KOR', 'KWT', 'LAO', 'LBN', 'LBR', 'LBY', 'LCA', 'LIE', 'LKA', 'LSO', 'LTU', 'LUX', 'LVA', 'MAC', 'MAF', 'MAR', 'MCO', 'MDA', 'MDG', 'MDV', 'MEX', 'MHL', 'MKD', 'MLI', 'MLT', 'MMR', 'MNE', 'MNG', 'MNP', 'MOZ', 'MRT', 'MSR', 'MTQ', 'MUS', 'MWI', 'MYS', 'MYT', 'NAM', 'NCL', 'NER', 'NFK', 'NGA', 'NIC', 'NIU', 'NLD', 'NOR', 'NPL', 'NRU', 'NZL', 'OMN', 'PAK', 'PAN', 'PER', 'PHL', 'PLW', 'PNG', 'POL', 'PRI', 'PRK', 'PRT', 'PRY', 'PSE', 'PYF', 'QAT', 'REU', 'ROU', 'RWA', 'SAU', 'SDN', 'SEN', 'SGP', 'SHN', 'SLB', 'SLE', 'SLV', 'SMR', 'SOM', 'SPM', 'SRB', 'SSD', 'STP', 'SUR', 'SVK', 'SVN', 'SWE', 'SWZ', 'SXM', 'SYC', 'SYR', 'TCA', 'TCD', 'TGO', 'THA', 'TJK', 'TKM', 'TLS', 'TON', 'TTO', 'TUN', 'TUR', 'TUV', 'TWN', 'TZA', 'UGA', 'UKR', 'URY', 'UZB', 'VAT', 'VCT', 'VEN', 'VGB', 'VIR', 'VNM', 'VUT', 'WLF', 'WSM', 'XAD', 'XCA', 'XKO', 'XNC', 'YEM', 'ZAF', 'ZMB', 'ZWE'] #countries is without CHN, DEU, RUS, USA countries = [y[:3] for y in os.listdir("/scistor/ivm/data_catalogue/open_street_map/road_networks/")] fin_countries = [y[:3] for y in os.listdir("/scistor/ivm/data_catalogue/open_street_map/percolation_metrics/")] left_countries = list(set(countries)-set(fin_countries)) left_countries = [x[:3] for x in left_countries] from random import shuffle #shuffle(left_countries) #left_countries = ['BRB', 'BTN', 'KNA', 'GUY', 'NFK', 'BLZ', 'WLF', 'SHN', 'WSM', 'KIR', 'MCO', 'VUT', 'TUV', 'XAD','ASM','FSM','MHL','PLW','VGB','MDV','SLB','VCT'] print(left_countries) with Pool(10) as pool: pool.map(split_record,left_countries,chunksize=1)
    :param patient_dirs: All directories run through OpenFace.
    :param patient: Patient to find annotated emotions for
    """
    tqdm_position, patient = patient
    curr_dirs = [x for x in patient_dirs if patient in x]
    for patient_dir in tqdm(curr_dirs, position=tqdm_position):
        find_scores(patient_dir, refresh)


if __name__ == '__main__':
    OPEN_DIR = sys.argv[sys.argv.index('-d') + 1]
    refresh = '--refresh' in sys.argv
    os.chdir(OPEN_DIR)
    # Directories have been previously cropped by CropAndOpenFace
    PATIENT_DIRS = [
        x for x in glob.glob('*cropped') if 'hdfs' in os.listdir(x)
    ]
    PATIENTS = get_patient_names(PATIENT_DIRS)
    # EYEBROW_DICT = process_eyebrows(OPEN_DIR,
    #                                 open(join(OPEN_DIR, 'eyebrows.txt')))
    PARTIAL_FIND_FUNC = functools.partial(find_one_patient_scores, PATIENT_DIRS, refresh)
    TUPLE_PATIENTS = [((i % 5), x) for i, x in enumerate(PATIENTS)]
    Pool(5).map(PARTIAL_FIND_FUNC, TUPLE_PATIENTS)
    # Pool().map(find_scores, PATIENTS)
    # for i, x in enumerate(PATIENTS):
    #     tuple_patient = (i % cpu_count(), x)
    #     find_one_patient_scores(PATIENT_DIRS, tuple_patient)
def generate_SVM_arrays_around_cds_start(list_input_transcripts, path_to_harringtonine_reads, path_to_harringtonine_psite, nthreads): """Generates positive and negative vectors for training SVM to predict start peaks using Harringtonine ribosome profiling data. (1) Loops through list of input transcripts, generating positive and negative example vectors from each transcript (a) Using the annotated_CDS_start for each transcript as a positive example, constructs the Ingolia vector (b) Constructs Ingolia vectors for 10 negative locations on each transcript as specified in 2011 cell paper (2) Concatenates positive and negative vectors into a single numpy array which is output (3) Returns a second array with labels for each vector (1=positive example, 0=negative example) --Input-- list_input_transcripts: iterable containing plastid transcript objects to be processed. All supplied transcripts are processed, so be sure you have filtered for highly expressed transcripts, and split into test/training groups beforehand. path_to_harringtonine_reads: path to .bam file containing harringtonine reads used to construct the arrays path_to_harringtonine_psite: path to file generated by plastid psite script giving psite offsets for harringtonine reads. nthreads: number cores to use for parallel processing. Please note that this uses pathos.multiprocessing ProcessingPool and it expects to find 'Pool' defined globally. You MUST run 'from pathos.multiprocessing import ProcessingPool as Pool' before trying this function! --Output-- Two np arrays: the first contains Ingolia vectors constructed from the input list. The second contains assignments (1=positive example, ie. vector constructed from annotated CDS_start locations. 0=negative example, constructed as specified in 2011 Cell paper). There are 10 negative examples per 1 positive example on each of the input transcripts passing basic QC filtering. """ def process_single_transcript_forSVM(input_transcript, path_to_harringtonine_reads, path_to_harringtonine_psite): print 'Working on ' + input_transcript.get_name() + '...' #Set up harringtonine reads harringtonine_reads = BAMGenomeArray(path_to_harringtonine_reads) harringtonine_reads.set_mapping( VariableFivePrimeMapFactory.from_file( open(path_to_harringtonine_psite))) #Set up vectors to append into positive_vectors = [] negative_vectors = [] #Ensure transcript competant to be a test example start_codon_nt = input_transcript.cds_start if not start_codon_nt - 25 > 0 or not start_codon_nt + 190 < input_transcript.get_length( ): #Ingolia says -18 on each side of initiation site scoring window. Noting that scoring window is -7 to 40 nt from given site, accounting for negative vector should be 150+40=190 on positive bound. return positive_vectors, negative_vectors #Create the vectors count_vector = input_transcript.get_counts(harringtonine_reads) positive_vectors.append( construct_Ingolia_vector(start_codon_nt, count_vector)) for z in [-6, -3, 3, 9, 18, 30, 60, 90, 120, 150]: negative_vectors.append( construct_Ingolia_vector(start_codon_nt + z, count_vector)) # print '...Done!' return positive_vectors, negative_vectors # output = Pool(nthreads).map(process_single_transcript_forSVM, list_input_transcripts, itertools.repeat(path_to_harringtonine_reads), itertools.repeat(path_to_harringtonine_psite)) #Unpack mapped output. 
Each iteration is output as a list, which contains two sub-lists with positive and negative vectors positive_vectors = [x[0] for x in output if len(x[0]) > 0] negative_vectors = [ x[1] for x in output if len(x[0]) > 0 ] #if x[0] = [], x[1] will also = []. This just removes iterations where the transcript didn't pass QC #Second level of unpacking. Take list of lists of vectors to just a list of vectors positive_vectors = list(itertools.chain.from_iterable(positive_vectors)) negative_vectors = list(itertools.chain.from_iterable(negative_vectors)) ingolia_vector_array = np.asarray(positive_vectors + negative_vectors) identities_for_array = np.asarray([1] * len(positive_vectors) + [0] * len(negative_vectors)) # return ingolia_vector_array, identities_for_array
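# Downstream, the two arrays returned above can be handed directly to a
# scikit-learn classifier; the analysis snippet that follows assumes a
# GridSearchCV object fit with refit=True. A hedged sketch (the C grid and
# kernel choice are assumptions, not the original training configuration):
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# X_vectors, y_labels = generate_SVM_arrays_around_cds_start(
#     training_transcripts, 'harringtonine.bam', 'harringtonine_psite.txt', nthreads=8)
# cv_SVM = GridSearchCV(SVC(kernel='linear', probability=True),
#                       param_grid={'C': [0.1, 1, 10]}, cv=5, refit=True)
# cv_SVM.fit(X_vectors, y_labels)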
#cv_SVM_human = dill.load(open('hg38_HEK_SVM.obj')) #Compile Regex for Ranges canonical_start_codon = re.compile('ATG', re.IGNORECASE) nearcanonical_start_codon = re.compile( '(CTG)|(GTG)|(TTG)|(ACG)|(AGG)|(ATC)|(ATT)|(AAG)|(ATA)', re.IGNORECASE) #figure out non-canonicals. stop_codon = re.compile('(TAA)|(TAG)|(TGA)', re.IGNORECASE) #Run Main Script. Note that because we have used refit=True with GridSearchCV we can use the SVM directly in the function (which just requires that the classifier has a .predict() method.) annotated_ORFs = Pool(23).map(assign_uORFs_from_harr_peaks, highly_translated, itertools.repeat('file.path'), itertools.repeat('file.path'), itertools.repeat(cv_SVM_human), itertools.repeat(True), itertools.repeat(scaler_grid), itertools.repeat(hg38_genome), itertools.repeat(canonical_start_codon), itertools.repeat(nearcanonical_start_codon), itertools.repeat(stop_codon), itertools.repeat(50)) #Reformat uORF annotations in a logical way. Start here if coming back from running a model with known settings. annotated_ORFs_final = [i for i in annotated_ORFs if len(i) > 0] annotated_ORFs_final = list( itertools.chain.from_iterable(annotated_ORFs_final)) #dill.dump(annotated_ORFs_final, open('HEK293T_uORFs.obj','wb')) #annotated_ORFs_final = dill.load(open('HEK293T_uORFs.obj')) #Have a look at the data first to see how we did. Looks fine. Definitely fewer 5'UTR reads and fewer uORFs. Cell culture probs? fout = open('HEK293T_uORFs.bed', 'w')
def climByAveragingPeriods(urls, # list of (daily) granule URLs for a long time period (e.g. a year) nEpochs, # compute a climatology for every N epochs (days) by 'averaging' nWindow, # number of epochs in window needed for averaging nNeighbors, # number of neighbors on EACH side in lat/lon directions to use in averaging variable, # name of primary variable in file mask, # name of mask variable coordinates, # names of coordinate arrays to read and pass on (e.g. 'lat' and 'lon') splitFn=splitModisSst, # split function to use to partition the input URL list maskFn=qcMask, # mask function to compute mask from mask variable averager='pixelMean', # averaging function to use, one of ['pixelMean', 'gaussInterp', 'spatialFilter'] averagingConfig={}, # dict of parameters to control the averaging function (e.g. gaussInterp) optimization='fortran', # optimization mode (fortran or cython) mode='sequential', # Map across time periods of N-days for concurrent work, executed by: # 'sequential' map, 'multicore' using pool.map(), 'cluster' using pathos pool.map(), # or 'spark' using PySpark numNodes=1, # number of cluster nodes to use nWorkers=4, # number of parallel workers per node averagingFunctions=AveragingFunctions, # dict of possible averaging functions legalModes=ExecutionModes, # list of possible execution modes cachePath=CachePath # directory to cache retrieved files in ): '''Compute a climatology every N days by applying a mask and averaging function. Writes the averaged variable grid, attributes of the primary variable, and the coordinate arrays in a dictionary. ***Assumption: This routine assumes that the N grids will fit in memory.*** ''' if averagingConfig['name'] == 'gaussInterp': averagingConfig['wlat'] = nNeighbors averagingConfig['wlon'] = nNeighbors try: averageFn = averagingFunctions[averager] except: print('climatology: Error, Averaging function must be one of: %s' % str(averagingFunctions), file=sys.stderr) sys.exit(1) urlSplits = [s for s in splitFn(urls, nEpochs)] def climsContoured(urls, plot=None, fillValue=default_fillvals['f4'], format='NETCDF4', cachePath=cachePath): n = len(urls) if VERBOSE: print(urls, file=sys.stderr) var = climByAveraging(urls, variable, mask, coordinates, maskFn, averageFn, averagingConfig, optimization, cachePath) fn = os.path.split(urls[0])[1] inFile = os.path.join(cachePath, fn) method = averagingConfig['name'] fn = os.path.splitext(fn)[0] day = fn[5:8] nDays = int(var['time'][0]) if 'wlat' in averagingConfig: wlat = averagingConfig['wlat'] else: wlat = 1 if int(wlat) == wlat: outFile = 'A%s.L3m_%dday_clim_sst_4km_%s_%dnbrs.nc' % (day, nDays, method, int(wlat)) # mark each file with first day in period else: outFile = 'A%s.L3m_%dday_clim_sst_4km_%s_%4.2fnbrs.nc' % (day, nDays, method, wlat) # mark each file with first day in period outFile = writeOutNetcdfVars(var, variable, mask, coordinates, inFile, outFile, fillValue, format) if plot == 'contour': figFile = contourMap(var, variable, coordinates, n, outFile) elif plot == 'histogram': # figFile = histogram(var, variable, n, outFile) figFile = None else: figFile = None return (outFile, figFile) if mode == 'sequential': results = list(map(climsContoured, urlSplits)) elif mode == 'multicore': pool = Pool(nWorkers) results = pool.map(climsContoured, urlSplits) elif mode == 'cluster': pass elif mode == 'spark': pass return results
            y = np.sqrt(1. - u[i]) * np.sin(theta[j])
            z = u[i]
            startpoints.append(radius * np.array([x, y, z]))
    return startpoints


def run_mfpts_from_bath(bathRad, numpoints, scalefactor, dt):
    np.random.seed()
    asympot3D = potentials.asym3Dpotential(scalefactor=scalefactor)
    p1 = mrd.particle(np.zeros(3), 1.0)
    sphereboundary = mrd.reflectiveSphere(bathRad)
    integrator = integrators.brownianDynamicsSp(asympot3D, sphereboundary, p1, dt, 1.0)
    sim = mrd.simulation(integrator)
    startpoints = get_startpoints(numpoints, bathRad - radiusThreshold)
    fpts = []
    for startpoint in startpoints:
        integrator.pa.position = startpoint
        integrator.clock = 0.
        fpts.append(sim.run_mfpt_points(np.array(minima), 0.2))
    print('bath to ' + str(bathRad))
    return np.array(fpts)


pool = Pool(processes=8)
FPT_list = pool.map(
    partial(run_mfpts_from_bath, scalefactor=2.0, numpoints=runs, dt=0.001),
    radii)
# write the pickled results in binary mode
dill.dump(FPT_list,
          open(path + 'fpts_on_' + str(runs * runs) + '_runs' + suffix, 'wb'))
def init_data(self, data_name, n_chunk=1024): print(f'Initializing {data_name} data...') def transform_triple_to_hrt(triple_idx): """ Transforms triple-idx (as a whole) to h/r/t format """ if triple_idx == -1: # for response_triple return NAF_TRIPLE triple = self.idx2triple[triple_idx] h, r, t = triple.split(', ') return [self.word2idx[h], self.rel2idx[r], self.word2idx[t]] def process_file(root, inp): start_i, filename = inp n_sample = line_count(filename) post = np.zeros((n_sample, self.args.max_sentence_len), dtype=np.int32) post_length = np.zeros( (n_sample), dtype=np.int32) # valid length (without pad) response = np.zeros((n_sample, self.args.max_sentence_len), dtype=np.int32) response_length = np.zeros((n_sample), dtype=np.int32) # post_triple = np.zeros((n_sample, self.args.max_sentence_len), dtype=np.int32) triple = np.zeros((n_sample, self.args.max_sentence_len, self.args.max_triple_len, 3), dtype=np.int32) entity = np.zeros((n_sample, self.args.max_sentence_len, self.args.max_triple_len), dtype=np.int32) response_triple = np.zeros( (n_sample, self.args.max_sentence_len, 3), dtype=np.int32) max_post_len, max_response_len, max_triple_len = 0, 0, 0 with jsonlines.open(filename) as df: for i, line in enumerate(df): pl, rl = len(line['post']) + 2, len(line['response']) + 2 post_length[i] = pl response_length[i] = rl max_post_len = max(pl, max_post_len) max_response_len = max(rl, max_response_len) max_triple_len = max([len(l) for l in line['all_triples']] + [max_triple_len]) all_triples = [ line['all_triples'][i - 1] if i > 0 else [-1] for i in line['post_triples'] ] post[i, :pl] = [SOS_IDX] + [ self.get_word_idx(p) for p in line['post'] ] + [EOS_IDX] response[i, :rl] = [SOS_IDX] + [ self.get_word_idx(r) for r in line['response'] ] + [EOS_IDX] # post_triple[i, 1:pl-1] = np.array(line['post_triples']) # [0, 0, 1, 0, 2...] 
response_triple[i, :rl] = [NAF_TRIPLE] + [ transform_triple_to_hrt(rt) for rt in line['response_triples'] ] + [NAF_TRIPLE] # put NAF_TRIPLE/entity at index 0 triple[i] = pad_2d( [[NAF_TRIPLE]] + [[transform_triple_to_hrt(t) for t in triples] for triples in all_triples] + [[NAF_TRIPLE]], length=(self.args.max_sentence_len, self.args.max_triple_len, 3)) entity[i] = pad_2d( [[NAF_IDX]] + [[self.entidx2wordidx[e] for e in entities] for entities in line['all_entities']] + [[NAF_IDX]], length=(self.args.max_sentence_len, self.args.max_triple_len)) # dump to zarr root['post'][start_i:start_i + n_sample] = post root['post_length'][start_i:start_i + n_sample] = post_length root['response'][start_i:start_i + n_sample] = response root['response_length'][start_i:start_i + n_sample] = response_length # root['post_triple'][start_i : start_i+n_sample] = post_triple root['triple'][start_i:start_i + n_sample] = triple root['entity'][start_i:start_i + n_sample] = entity root['response_triple'][start_i:start_i + n_sample] = response_triple return max_post_len, max_response_len, max_triple_len toread = [ f'{self.data_path}/{data_name}set_pieces/{piece}' for piece in os.listdir(f'{self.data_path}/{data_name}set_pieces') ] n_lines = sum([line_count(piece) for piece in toread]) init_n_lines = math.ceil( n_lines / n_chunk) * n_chunk # 마지막 조각 사이즈가 지정된 청크 사이즈보다 작아져서 나는 에러 방지 root = zarr.open(f'{self.data_path}/{data_name}set_new.zarr', mode='w') post = root.zeros('post', shape=(init_n_lines, self.args.max_sentence_len), chunks=(n_chunk, None), dtype='i4') post_length = root.zeros('post_length', shape=(init_n_lines, ), chunks=(n_chunk, ), dtype='i4') # valid length (without pad) response = root.zeros('response', shape=(init_n_lines, self.args.max_sentence_len), chunks=(n_chunk, None), dtype='i4') response_length = root.zeros('response_length', shape=(init_n_lines, ), chunks=(n_chunk, ), dtype='i4') post_triple = root.zeros('post_triple', shape=(init_n_lines, self.args.max_sentence_len), chunks=(n_chunk, None), dtype='i4') triple = root.zeros('triple', shape=(init_n_lines, self.args.max_sentence_len, self.args.max_triple_len, 3), chunks=(n_chunk, None, None, None), dtype='i4') entity = root.zeros('entity', shape=(init_n_lines, self.args.max_sentence_len, self.args.max_triple_len), chunks=(n_chunk, None, None), dtype='i4') response_triple = root.zeros('response_triple', shape=(init_n_lines, self.args.max_sentence_len, 3), chunks=(n_chunk, None, None), dtype='i4') pool = Pool(min(len(toread), mp.cpu_count())) func = functools.partial(process_file, root) iterinp = [(i * self.args.data_piece_size, filename) for i, filename in enumerate(toread)] max_post_lens, max_response_lens, max_triple_lens = zip( *tqdm(pool.imap(func, iterinp), total=len(iterinp))) max_post_len, max_response_len, max_triple_len = max( max_post_lens), max(max_response_lens), max(max_triple_lens) # trim remaining space post.resize(n_lines, max_post_len) post_length.resize(n_lines) response.resize(n_lines, max_response_len) response_length.resize(n_lines) post_triple.resize(n_lines, max_post_len) triple.resize(n_lines, max_post_len, max_triple_len, 3) entity.resize(n_lines, max_post_len, max_triple_len) response_triple.resize(n_lines, max_response_len, 3) print( f'Dumped {data_name} at: {self.data_path}/{data_name}set_new.zarr')
def ice_archive(d18Oice, pr_ann, tas_ann, psl_ann, nproc=8): ''' Accounts for diffusion and compaction in the firn. Args: d18Oice (1d array: year in int): annualizd d18O of ice [permil] pr_ann (1d array: year in int): precipitation rate [kg m-2 s-1] tas_ann (1d array: year in int): annualizd atomspheric temerature [K] psl_ann (1d array: year in int): annualizd sea level pressure [Pa] nproc (int): the number of processes for multiprocessing Returns: ice_diffused (1d array: year in int): archived ice d18O [permil] ''' # ====================================================================== # A.0: Initialization # ====================================================================== # accumulation rate [m/yr] # note that the unit of pr_ann is [kg m-2 s-1], so need to divide by density [kg m-3] and convert the time yr2sec_factor = 3600*24*365.25 accum = pr_ann/1000*yr2sec_factor # depth horizons (accumulation per year corresponding to depth moving down-core) bdown = accum[::-1] bmean = np.mean(bdown) depth = np.sum(bdown) depth_horizons = np.cumsum(bdown) dz = np.min(depth_horizons)/10. # step in depth [m] Tmean = np.mean(tas_ann) # unit in [K] Pmean = np.mean(psl_ann)*9.8692e-6 # unit in [Atm] # contants rho_s = 300. # kg/m^3, surface density rho_d = 822. # kg/m^2, density at which ice becomes impermeable to diffusion rho_i = 920. # kg/m^3, density of solid ice # ====================================================================== # A.1: Compaction Model # ====================================================================== z = np.arange(0, depth, dz) + dz # linear depth scale # set density profile by calling densification function rho, zieq, t = densification(Tmean, bmean, rho_s, z) rho = rho[:len(z)] # cutoff the end time_d = np.cumsum(dz/bmean*rho/rho_i) ts = time_d*yr2sec_factor # convert time in years to ts in seconds # integrate diffusivity along the density gradient to obtain diffusion length D = diffusivity(rho, Tmean, Pmean, rho_d, bmean) D = D[:-1] rho = rho[:-1] diffs = np.diff(z)/np.diff(time_d) diffs = diffs[:-1] # Integration using the trapezoidal method # IMPORTANT: once the ice reaches crtiical density (solid ice), there will no longer # be any diffusion. There is also numerical instability at that point. Set Sigma=1E-13 for all # points below that threshold. # Set to 915 to be safe. solidice = np.where(rho >= rho_d-5.0) diffusion = np.where(rho < rho_d-5.0) dt = np.diff(ts) sigma_sqrd_dummy = 2*np.power(rho, 2)*dt*D sigma_sqrd = integrate.cumtrapz(sigma_sqrd_dummy) diffusion_array = diffusion[0] diffusion_array = diffusion_array[diffusion_array < len(sigma_sqrd)] # fzhu: to avoid the boundary index error diffusion = np.array(diffusion_array) # rho=rho[0:-1] # modified by fzhu to fix inconsistency of array size # sigma=np.zeros((len(rho)+1)) # modified by fzhu to fix inconsistency of array size sigma = np.zeros((len(rho))) sigma[diffusion] = np.sqrt(1/np.power(rho[diffusion],2)*sigma_sqrd[diffusion]) # modified by fzhu to fix inconsistency of array size #sigma[solidice]=np.nanmax(sigma) #max diffusion length in base of core // set in a better way. max(sigma) sigma[solidice] = sigma[diffusion][-1] sigma = sigma[:-1] # ====================================================================== # A.2. Diffusion Profile # ====================================================================== # Load water isotope series del18 = np.flipud(d18Oice) # NOTE YOU MIGHT NOT NEED FLIP UD here. Our data goes forward in time. 
# interpolate over depths to get an array of dz values corresponding to isotope values for convolution/diffusion iso_interp = np.interp(z, depth_horizons, del18) # Return a warning if the kernel length is approaching 1/2 that of the timeseries. # This will result in spurious numerical effects. zp = np.arange(-100, 100, dz) if (len(zp) >= 0.5*len(z)): print("Warning: convolution kernel length (zp) is approaching that of half the length of timeseries. Kernel being clipped.") bound = 0.20*len(z)*dz zp = np.arange(-bound, bound, dz) # print('start for loop ...') # start_time = time.time() rm = np.nanmean(iso_interp) cdel = iso_interp-rm diffused_final = np.zeros(len(iso_interp)) if nproc == 1: for i in tqdm(range(len(sigma))): sig = sigma[i] part1 = 1./(sig*np.sqrt(2.*np.pi)) part2 = np.exp(-zp**2/(2*sig**2)) G = part1*part2 # diffused = np.convolve(G, cdel, mode='same')*dz # fzhu: this is way too slow diffused = signal.fftconvolve(cdel, G, mode='same')*dz # put cdel in the front to keep the same length as before diffused += rm # remove mean and then put back diffused_final[i] = diffused[i] else: # print('Multiprocessing: nproc = {}'.format(nproc)) def conv(sig, i): part1 = 1./(sig*np.sqrt(2.*np.pi)) part2 = np.exp(-zp**2/(2*sig**2)) G = part1*part2 diffused = signal.fftconvolve(cdel, G, mode='same')*dz diffused += rm # remove mean and then put back return diffused[i] res = Pool(nproc).map(conv, sigma, range(len(sigma))) diffused_final[:len(res)] = np.array(res) # print('for loop: {:0.2f} s'.format(time.time()-start_time)) # take off the first few and last few points used in convolution diffused_timeseries = diffused_final[0:-3] # Now we need to pack our data back into single year data units based on the depths and year interpolated data final_iso = np.interp(depth_horizons, z[0:-3], diffused_timeseries) ice_diffused = final_iso return ice_diffused
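# A quick synthetic-data sketch of calling ice_archive; all values below are
# invented placeholders chosen only to have plausible units (permil,
# kg m-2 s-1, K, Pa), not data from the source.
import numpy as np

# nyr = 200
# d18O_synth = -35.0 + np.random.randn(nyr)      # permil
# pr_synth = np.full(nyr, 1.0e-5)                # kg m-2 s-1
# tas_synth = np.full(nyr, 243.0)                # K
# psl_synth = np.full(nyr, 1.0e5)                # Pa
# diffused = ice_archive(d18O_synth, pr_synth, tas_synth, psl_synth, nproc=4)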
    time.sleep(.05)
    result = {'ID': x[0], 'sepArea': x[1] * x[2], 'petArea': x[3] * x[4]}
    if verbose:
        print(f'done with: {x[0]}')
    return result


data = pd.read_csv('https://gist.githubusercontent.com/curran/a08a1080b88344b0c8a7/raw/0e7a9b0a5d22642a06d3d5b9bcbad9890c8ee534/iris.csv')
data.reset_index(inplace=True)  # Use idx as identifier

## Convert dataset to tuple with names
irises = tuple(data.itertuples(name='Iris', index=False))
tIris = [tuple(_) for _ in irises]

### Multiprocessing implementation
pool4 = Pool(4)
pool8 = Pool(8)
pool16 = Pool(16)

## Map function over
print('4\n')
start = time.time()
resParallel = pool4.map(getAreaSlowTuples, tIris)
end = time.time()
print(f'time: {end-start:.4f}s\n')

## 8
print('8\n')
start = time.time()
resParallel = pool8.map(getAreaSlowTuples, tIris)
end = time.time()
def reflection_loss(data=None, f_set=None, d_set=None, **kwargs): """ The reflection_loss (RL) function calculates the RL based on the mapping passed through as the grid variable, done either through multiprocessing or through the python built-in map() function. The RL function always uses the interpolation function, even though as the function passes through the points associated with the input data, solving for the function at the associated frequencies yields the data point. This is simply for simplicity. ref: https://doi.org/10.1016/j.jmat.2019.07.003 --------------------------------------- :: :param data: (data) Permittivity and Permeability data of Nx5 dimensions. Can be a string equivalent to the directory and file name of either a .csv or .xlsx of Nx5 dimensions. Text above and below data array will be automatically avoided by the program (most network analysis instruments report data which is compatible with the required format) --------------------------------------- :: :param f_set: (start, end, [step]) tuple for frequency values in GHz - if given as list of len 3, results are interpolated - if given as list of len 2, results are data-derived with the calculation bound by the given start and end frequencies - if f_set is None, frequency is bound to input data --------------------------------------- :: :param d_set: (start, end, step) tuple for thickness values in mm. - if d_set is of type list, then the thickness values calculated will only be of the values present in the list. --------------------------------------- :: :param kwargs: interp= ('cubic'); 'linear' Method for interpolation. Set to linear if user wants to linear interp instead of cubic spline. Default action uses cubic spline. --------------------------------------- :: :param kwargs: override= (None); 'chi zero', 'eps set' provides response simulation functionality within libRL, common for discerning which EM parameters are casual for reflection loss. 'chi zero' sets mu = (1 - j*0). 'eps set' sets epsilon = (avg(e1)-j*0). --------------------------------------- :: :param kwargs: multiprocessing= (False); True, 0, 1, 2, ... Method for activating multiprocessing functionality for faster run times. This kwarg takes integers and booleans. Set variable to True or 0 to use all available nodes. Pass an integer value to use (int) nodes. Will properly handle 'False' as an input though it's equivalent to not even designating the particular kwarg. NOTE: if you use the multiprocessing functionality herein while on a Windows computer you ***MUST MUST MUST MUST*** provide main module protection via the :code:`if __name__ == "__main__":` conditional so to negate infinite spawns. --------------------------------------- :: :param kwargs: quick_graph= (False); True, str() Saves a *.png graphical image to a specified location. If set to True, the quick_graph function saves the resulting graphical image to the location of the input data as defined by the data input (assuming that the data was input via a location string. If not, True throws an assertion error). The raw string of a file location can also be passed as the str() argument, if utilized then the function will save the graph at the specified location. --------------------------------------- :: :param kwargs: as_dataframe=: (False); True returns data in a pandas dataframe. This is particularly useful if multicolumn is also set to true. 
--------------------------------------- :: :param kwargs: multicolumn=: (False); True outputs data in multicolumn form with a numpy array of [RL, f, d] iterated over each of the three columns. - if as_dataframe is used, then return value will be a pandas dataframe with columns of name d and indexes of name f. --------------------------------------- :: :return: [RL, f, d] returns Nx3 data set of [RL, f, d] by default - if multicolumn=True, an NxM dataframe with N rows for the input frequency values and M columns for the input thickness values, with pandas dataframe headers/indexes of value f/d respectively. """ # data is refactored into a Nx5 numpy array by the file_refactor # function from 'refactoring.py' if 'quick_graph' in kwargs and kwargs['quick_graph'] is True: kwargs['quick_graph'] = refactoring.qgref(data) data = refactoring.file_refactor(data) # acquire the desired interpolating functions from 'refactoring.py' e1f, e2f, mu1f, mu2f = refactoring.interpolate(data, **kwargs) # refactor the data sets in accordance to refactoring protocols # in 'refactoring.py' f_set = refactoring.f_set_ref(f_set, data) d_set = refactoring.d_set_ref(d_set) # construct a data grid for mapping from refactored data sets # d *must* be first as list comprehension cycles through f_set # for each d value, and this is deterministic of the structure # of the resultant. grid = array([(m, n) for n in d_set for m in f_set], dtype=float64) # just a constant j = cmath.sqrt(-1) def gamma(grid): f = grid[0] d = grid[1] # I know, it's super ugly. y = (20 * cmath.log10( (abs(((1 * (cmath.sqrt((mu1f(f) - j * mu2f(f)) / (e1f(f) - cmath.sqrt(-1) * e2f(f)))) * (cmath.tanh(j * (2 * cmath.pi * (f * 10**9) * (d * 0.001) / 299792458) * cmath.sqrt( (mu1f(f) - j * mu2f(f)) * (e1f(f) - j * e2f(f)))))) - 1) / ((1 * (cmath.sqrt( (mu1f(f) - j * mu2f(f)) / (e1f(f) - j * e2f(f)))) * (cmath.tanh(j * (2 * cmath.pi * (f * 10**9) * (d * 0.001) / 299792458) * cmath.sqrt( (mu1f(f) - j * mu2f(f)) * (e1f(f) - j * e2f(f)))))) + 1))))) # return inputted data for documentation and return # the real portion of y to drop complex portion # of form j*0 return y.real, f, d # if multiprocessing is given as True or as # a zero integer, use all available nodes # if multiprocessing is given and is a non-zero # integer, use int value for number of nodes # if multiprocessing is given as False (for some # reason?), or anything else, ignore it. # returns res of Zx3 data where Z is the product # of len(f_set) and len(d_set) if 'multiprocessing' in kwargs and isinstance(kwargs['multiprocessing'], int) is True: if kwargs['multiprocessing'] is True or kwargs['multiprocessing'] is 0: res = array(Pool().map(gamma, grid)) elif kwargs['multiprocessing'] > 0: res = array(Pool(nodes=kwargs['multiprocessing']).map(gamma, grid)) else: res = array(list(map(gamma, grid))) else: res = array(list(map(gamma, grid))) # takes data derived from computation and the file directory string and # generates a graphical image at the at location. 
if 'quick_graph' in kwargs and isinstance(kwargs['quick_graph'], str) is True: quick_graphs.quick_graph_reflection_loss( results=res, location=kwargs['quick_graph']) # formatting option, sometimes professors # like 3 columns for each thickness value if 'multicolumn' in kwargs and kwargs['multicolumn'] is True: # get frequency values from grid so # to normalize the procedure due to the # various frequency input methods gridInt = int(grid.shape[0] / d_set.shape[0]) # zero-array of NxM where N is the frequency # values and M is 3 times the # number of thickness values MCres = zeros((gridInt, d_set.shape[0] * 3)) # map the Zx3 result array to the NxM array for i in arange(int(MCres.shape[1] / 3)): MCres[:, 3 * i:3 * i + 3] = res[i * gridInt:(i + 1) * gridInt, 0:3] # stick the MultiColumn Array in the place of the results array res = MCres if 'as_dataframe' in kwargs and kwargs['as_dataframe'] is True: if 'multicolumn' in kwargs and kwargs['multicolumn'] is True: res = DataFrame(res[:, ::3]) res.columns = list(d_set) res.index = list(f_set) else: res = DataFrame(res) res.columns = ['RL', 'f', 'd'] return res
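# An example call matching the kwargs documented in the docstring above; the
# file name and the frequency/thickness ranges are placeholders, not values
# from the source.
# results = reflection_loss(
#     data='measured_parameters.csv',
#     f_set=(1, 18, 0.1),      # GHz; len-3 tuple -> interpolated
#     d_set=(0.5, 5, 0.1),     # mm
#     multiprocessing=True,    # use all available nodes
#     multicolumn=True,
#     as_dataframe=True,
# )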
def germline_filter(processes, germline_path, cells_path, metadata_path, out_path): """ filter out common SNPs/indels between germline samples and samples of interest """ germline_path = Path(germline_path) cells_path = Path(cells_path) metadata_path = Path(metadata_path) out_path = Path(out_path) metadata_df = pd.read_csv(metadata_path) # Create a set of all patient IDs from the metadata file. all_patient_ids = set(metadata_df["patient_id"]) def process_patient(patient_id): # Find all non-tumor bulk VCF files for the patient ID. germline_wb_vcf_paths = list( germline_path.glob(patient_id + "_*_*.vcf")) # Fetch all cell IDs associated with the patient ID. cell_ids = metadata_df.loc[metadata_df["patient_id"] == patient_id]["cell_id"] # Use the cell IDs to create a list of all single-cell VCF files for the patient. cell_vcf_paths = [(cells_path / cell_id).with_suffix(".vcf") for cell_id in cell_ids] # Create a genome interval tree for the patient's germline bulk VCF # data. Only selects one germline VCF to avoid over-filtering for # patients with multiple germline VCFs. germline_tree = create_germline_genome_tree(germline_wb_vcf_paths[0:1]) def process_cell(cell_vcf_path): if not cell_vcf_path.exists(): return # If there were any germline VCFs for this patient, append `GF_` to # the file name to indicate that the output VCF was # germline-filtered, not just dbSNP-filtered. out_name_prefix = "" if len(germline_wb_vcf_paths) < 1 else "GF_" out_vcf_path = out_path / (out_name_prefix + cell_vcf_path.name) with open(cell_vcf_path, mode='r') as in_file: with open(out_vcf_path, mode='w') as out_file: write_filtered_vcf(in_file, germline_tree, out_file) # TODO: Maybe remove this in Python 3.8. # This thread pool max-worker count is from the implementation in # Python 3.8. Assuming that Pathos adopts the same semantics, # this can be removed. with ThreadPool(min(32, os.cpu_count() + 4)) as pool: pool.map(process_cell, cell_vcf_paths) print("Running germline filter...") if processes > 1: with Pool(processes) as pool: list( tqdm(pool.imap(process_patient, all_patient_ids), total=len(all_patient_ids), smoothing=0.01)) else: list(map(process_patient, tqdm(all_patient_ids, smoothing=0.1))) print("Done!")
def use_smac(emotion):
    scenario = Scenario({
        'run_obj': 'quality',
        'runcount-limit': 200,
        "cs": make_cs(),
        "deterministic": "true",
        "shared_model": True,
        "input_psmac_dirs": "smac3-output*",
        "seed": np.random.RandomState()
    })
    smac = SMAC(scenario=scenario,
                rng=np.random.RandomState(42),
                tae_runner=forest_from_cfg)
    incumbent = smac.optimize()
    # joblib.dump(RandomForestClassifier(**incumbent), '{0}_smac_optimized_random_forest.pkl'.format(emotion))
    inc_value = forest_from_cfg(incumbent, emotion)
    out_writer.write("Optimized Value for {0}: {1}".format(emotion, inc_value))
    out_writer.write('\n' + '\n')
    out_writer.write(str(incumbent))


if __name__ == '__main__':
    OpenDir = sys.argv[sys.argv.index('-d') + 1]
    os.chdir(OpenDir)
    print("Optimizing")
    out_file = 'smac.txt'
    # open the results file for writing; use_smac appends its results to it
    with open(out_file, 'w') as out_writer:
        Pool(len(emotion_list())).map(use_smac, emotion_list())
def _power_bandwidth_variance(spectral_data, l0, dl, w, ncores=8): ''' Calculated the power bandwidth optimization parameter for the given spectral data. Args: spectral_data : the spectrum to calculate Delta^op for, two columns, first column is wavelength, second column is normalized spectral data. Will calculate Delta^op over the whole range of the spectrum. Spectrum should be fairly free of noise, filter noisy data first. w : a numpy array containing the peak widths to calculate lstep : the resolution of lambda_0 in nm, default 2 nm dlmin : the minimum Delta lambda to calculate dlmax :the maximum Delta lambda to calculate dlN : the number of values to calculate Delta lambda for Returns: An array with parameters and calculated values in the form: [l0, dl, w, Delta^op] Where l0, dl, w are all parameters, and Delta^op is the power bandwidth optimization parameter respectively as numpy arrays with dl for rows, l0 for columns and w for the third axis (2D array is only one values of w is given) ''' sx = spectral_data[:, 0] sy = spectral_data[:, 1] cols = len(l0) rows = len(dl) N = len(w) ua = np.zeros((rows, cols, N)) ub = np.zeros((rows, cols, N)) du = np.zeros((rows, cols, N)) spectrum = interp1d(sx, sy, kind='cubic', bounds_error=False, fill_value=np.min(sy)) args_array = [] for i in range(rows): args_array.append([]) for j in range(cols): args_array[i].append([l0[j], dl[i]]) int = _integrator(w[0], spectrum) # Loop over values of w t0 = timer() for i in range(N): ts = timer() int.setw(w[i]) ua[:, :, i] = _multiprocess2D(int.ua_integral, args_array, ncores=ncores, display=False) ub[:, :, i] = _multiprocess2D(int.ub_integral, args_array, ncores=ncores, display=False) for ii in range(rows): for jj in range(cols): du[ii, jj, i] = np.abs(ua[ii, jj, i] - ub[ii, jj, i]) tf = timer() _print( str(i + 1) + '/' + str(N) + ' complete ' + str(datetime.timedelta(seconds=tf - ts))) Pool(nodes=ncores).clear( ) # Because pathos is designed to leave Pools running, and sometimes doesn't get rid of them after the caluculation is complete _print('Calculations Complete in ' + str(datetime.timedelta(seconds=tf - t0))) return [l0, dl, w, du]
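# A hedged usage sketch for _power_bandwidth_variance; the toy spectrum and the
# l0/dl/w grids below are invented placeholders, not parameters from the
# source.
# import numpy as np
# wl = np.linspace(400, 900, 2000)                  # wavelength [nm]
# spec = np.exp(-((wl - 650.0) / 60.0) ** 2)        # normalized toy spectrum
# spectral_data = np.column_stack([wl, spec])
# l0 = np.arange(500.0, 801.0, 2.0)                 # center wavelengths [nm]
# dl = np.linspace(5.0, 100.0, 40)                  # bandwidths [nm]
# w = np.array([1.0, 2.0])                          # peak widths
# l0_, dl_, w_, du = _power_bandwidth_variance(spectral_data, l0, dl, w, ncores=8)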
def fit(self, X, y): """Extract shapelets from the provided timeseries and labels. Parameters ---------- X : array-like, shape = [n_ts, ] The training input timeseries. Each timeseries must be an array, but the lengths can be variable y : array-like, shape = [n_samples] The target values. """ # If y is a 1D list, convert it to a 2D column np array if type(y) is list or len(y.shape) == 1: y = np.reshape(y, (-1, 1)) # Sci-kit learn checks check_array(X) check_array(y) # Determine the minimum and maximum shapelet length min_len = 4 max_len = min([len(x) for x in X]) # We will try to maximize the negative logloss of LR in CV. # In the case of ties, we pick the one with least number of shapelets weights = (1.0, -1.0) creator.create("FitnessMax", base.Fitness, weights=weights) # Individual are lists (of shapelets (list)) creator.create("Individual", list, fitness=creator.FitnessMax) def random_shapelet(n_shapelets): """Extract a random subseries from the training set""" shaps = [] for _ in range(n_shapelets): rand_row = np.random.randint(X.shape[0]) rand_length = np.random.randint(min_len, max_len) rand_col = np.random.randint(X.shape[1] - rand_length) shaps.append(X[rand_row, rand_col:rand_col + rand_length]) if n_shapelets > 1: return np.array(shaps) else: return np.array(shaps[0]) def motif(n_shapelets, n_draw=100): """Extract some motifs from sampled timeseries""" shaps = [] for _ in range(n_shapelets): rand_length = np.random.randint(min_len, max_len) subset_idx = np.random.choice(range(len(X)), size=n_draw, replace=True) ts = X[subset_idx, :].flatten() matrix_profile, _ = mstamp_stomp(ts, rand_length) motif_idx = matrix_profile[0, :].argsort()[-1] shaps.append(ts[motif_idx:motif_idx + rand_length]) if n_shapelets > 1: return np.array(shaps) else: return np.array(shaps[0]) def kmeans(n_shapelets, shp_len, n_draw=1000): """Sample subseries from the timeseries and apply K-Means on them""" # Sample `n_draw` subseries of length `shp_len` n_ts, sz = X.shape indices_ts = np.random.choice(n_ts, size=n_draw, replace=True) start_idx = np.random.choice(sz - shp_len + 1, size=n_draw, replace=True) end_idx = start_idx + shp_len subseries = np.zeros((n_draw, shp_len)) for i in range(n_draw): subseries[i] = X[indices_ts[i], start_idx[i]:end_idx[i]] tskm = TimeSeriesKMeans(n_clusters=n_shapelets, metric="euclidean", verbose=False) return tskm.fit(subseries).cluster_centers_[0] def create_individual(n_shapelets=None): """ Generate a random shapelet set """ if n_shapelets is None: n_shapelets = 1 rand = np.random.random() if rand < 1. / 3.: return [motif(n_shapelets)] elif 1. / 3. < rand < 2. 
/ 3.: return [ kmeans(n_shapelets, np.random.randint(min_len, max_len)) ] else: return [random_shapelet(n_shapelets)] def cost(shapelets): """ Calculate the fitness of an individual/shapelet set""" start = time.time() D = np.zeros((len(X), len(shapelets))) for k in range(len(X)): ts = X[k, :] for j in range(len(shapelets)): if self.normed: dist = util.sdist(shapelets[j].flatten(), ts) else: dist = util.sdist_no_norm(shapelets[j].flatten(), ts) D[k, j] = dist lr = LogisticRegression() skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=1337) preds = cross_val_predict(lr, D, y, method='predict_proba', cv=skf) cv_score = -log_loss(y, preds) return (cv_score, sum([len(x) for x in shapelets])) def add_noise(shapelets): """Add random noise to a random shapelet""" rand_shapelet = np.random.randint(len(shapelets)) tools.mutGaussian(shapelets[rand_shapelet], mu=0, sigma=0.1, indpb=0.15) return shapelets, def remove_shapelet(shapelets): """Remove a random shapelet from the individual""" if len(shapelets) > 1: rand_shapelet = np.random.randint(len(shapelets)) shapelets.pop(rand_shapelet) return shapelets, def merge_crossover(ind1, ind2): """ Merge shapelets from one set with shapelets from the other """ # Construct a pairwise similarity matrix using GAK _all = list(ind1) + list(ind2) similarity_matrix = cdist_gak(ind1, ind2, sigma=sigma_gak(_all)) # Iterate over shapelets in `ind1` and merge them with shapelets # from `ind2` for row_idx in range(similarity_matrix.shape[0]): # Remove all elements equal to 1.0 mask = similarity_matrix[row_idx, :] != 1.0 non_equals = similarity_matrix[row_idx, :][mask] if len(non_equals): # Get the timeseries most similar to the one at row_idx max_col_idx = np.argmax(non_equals) ts1 = list(ind1[row_idx]).copy() ts2 = list(ind2[max_col_idx]).copy() # Merge them and remove nans ind1[row_idx] = euclidean_barycenter([ts1, ts2]) ind1[row_idx] = ind1[row_idx][~np.isnan(ind1[row_idx])] # Apply the same for the elements in ind2 for col_idx in range(similarity_matrix.shape[1]): mask = similarity_matrix[:, col_idx] != 1.0 non_equals = similarity_matrix[:, col_idx][mask] if len(non_equals): max_row_idx = np.argmax(non_equals) ts1 = list(ind1[max_row_idx]).copy() ts2 = list(ind2[col_idx]).copy() ind2[col_idx] = euclidean_barycenter([ts1, ts2]) ind2[col_idx] = ind2[col_idx][~np.isnan(ind2[col_idx])] return ind1, ind2 def point_crossover(ind1, ind2): """ Apply one- or two-point crossover on the shapelet sets """ if len(ind1) > 1 and len(ind2) > 1: if np.random.random() < 0.5: ind1, ind2 = tools.cxOnePoint(list(ind1), list(ind2)) else: ind1, ind2 = tools.cxTwoPoint(list(ind1), list(ind2)) return ind1, ind2 # Register all operations in the toolbox toolbox = base.Toolbox() if self.n_jobs > 1: pool = Pool(self.n_jobs) toolbox.register("map", pool.map) else: toolbox.register("map", map) # Register all our operations to the DEAP toolbox toolbox.register("merge", merge_crossover) toolbox.register("cx", point_crossover) toolbox.register("mutate", add_noise) toolbox.register("remove", remove_shapelet) toolbox.register("individual", tools.initIterate, creator.Individual, create_individual) toolbox.register("population", tools.initRepeat, list, toolbox.individual) toolbox.register("evaluate", cost) # Small tournaments to ensure diversity toolbox.register("select", tools.selTournament, tournsize=3) # Set up the statistics. 
We will measure the mean, std dev and max stats = tools.Statistics(key=lambda ind: ind.fitness.values[0]) stats.register("avg", np.mean) stats.register("std", np.std) stats.register("max", np.max) # Initialize the population and calculate their initial fitness values pop = toolbox.population(n=self.population_size) fitnesses = list(map(toolbox.evaluate, pop)) for ind, fit in zip(pop, fitnesses): ind.fitness.values = fit # Keep track of the best iteration, in order to do stop after `wait` # generations without improvement it, best_it = 1, 1 best_ind = [] best_score = float('-inf') # Set up a matplotlib figure and set the axes height = int(np.ceil(self.population_size / 4)) if self.plot is not None and self.plot != 'notebook': if self.population_size <= 20: f, ax = plt.subplots(4, height, sharex=True) else: plt.figure(figsize=(15, 5)) plt.xlim([0, len(X[0])]) # The genetic algorithm starts here while it <= self.iterations and it - best_it < self.wait: gen_start = time.time() # Clone the population into offspring offspring = list(map(toolbox.clone, pop)) # Plot the fittest individual of our population if self.plot is not None: if self.population_size <= 20: if self.plot == 'notebook': f, ax = plt.subplots(4, height, sharex=True) for ix, ind in enumerate(offspring): ax[ix // height][ix % height].clear() for s in ind: ax[ix // height][ix % height].plot( range(len(s)), s) plt.pause(0.001) if self.plot == 'notebook': plt.show() else: plt.clf() for shap in best_ind: plt.plot(range(len(shap)), shap) plt.pause(0.001) # Iterate over all individuals and apply CX with certain prob for child1, child2 in zip(offspring[::2], offspring[1::2]): try: if np.random.random() < self.crossover_prob: toolbox.merge(child1, child2) del child1.fitness.values del child2.fitness.values if np.random.random() < self.crossover_prob: toolbox.cx(child1, child2) del child1.fitness.values del child2.fitness.values except: raise # Apply mutation to each individual for idx, indiv in enumerate(offspring): if np.random.random() < self.add_noise_prob: toolbox.mutate(indiv) del indiv.fitness.values if np.random.random() < self.remove_shapelet_prob: toolbox.remove(indiv) del indiv.fitness.values # Update the fitness values invalid_ind = [ind for ind in offspring if not ind.fitness.valid] fitnesses = toolbox.map(toolbox.evaluate, invalid_ind) for ind, fit in zip(invalid_ind, fitnesses): ind.fitness.values = fit # Replace population and update hall of fame & statistics new_pop = toolbox.select(offspring, self.population_size - 1) fittest_ind = tools.selBest(pop + offspring, 1) pop[:] = new_pop + fittest_ind it_stats = stats.compile(pop) # Print our statistics if self.verbose: if it == 1: print('it\t\tavg\t\tstd\t\tmax\t\ttime') print('{}\t\t{}\t\t{}\t\t{}\t{}'.format( it, np.around(it_stats['avg'], 4), np.around(it_stats['std'], 3), np.around(it_stats['max'], 6), np.around(time.time() - gen_start, 4), )) # Have we found a new best score? if it_stats['max'] > best_score: best_it = it best_score = it_stats['max'] best_ind = tools.selBest(pop + offspring, 1) it += 1 self.shapelets = np.array(best_ind[0])
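# Hedged usage sketch for the genetic shapelet search implemented in fit()
# above. `ShapeletExtractor` is a placeholder name for whichever class owns
# this method; the constructor arguments mirror the attributes fit() reads
# (population_size, iterations, wait, crossover_prob, add_noise_prob,
# remove_shapelet_prob, normed, n_jobs, plot, verbose), though the real class
# may name or default them differently.
import numpy as np

X = np.random.randn(50, 100)            # 50 toy timeseries of length 100
y = np.random.randint(0, 2, size=50)    # binary class labels

extractor = ShapeletExtractor(population_size=20, iterations=25, wait=10,
                              crossover_prob=0.5, add_noise_prob=0.3,
                              remove_shapelet_prob=0.3, normed=False,
                              n_jobs=1, plot=None, verbose=False)
extractor.fit(X, y)
print(extractor.shapelets)              # best shapelet set found by the GA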
# Make all of the folders for the tile for y in range(16): for x in range(16): os.makedirs(path + "/Tile" + str(tileNum) + "_" + str(y) + "_" + str(x), exist_ok=True) # Generate the new chunks for file in files: # Load image with 50px padding img = cv2.imread(file[0], -1) img = cv2.copyMakeBorder(img, 50, 50, 50, 50, cv2.BORDER_CONSTANT) for y in range(16): for x in range(16): i = (y * 50, x * 50) newImgPath = path + "/Tile" + str(tileNum) + "_" + str( y) + "_" + str(x) + "/" + file[1] split = img[i[0]:i[0] + NEWHEIGHT, i[1]:i[1] + NEWWIDTH] cv2.imwrite(newImgPath, split) if CREATE: p = Pool(NUMPROCESS) results = p.map(lambda a: processImg(a, "/dfc2021_dse_train/Train"), list(range(1, 61))) results = p.map(lambda a: processImg(a, "/dfc2021_dse_val/Val"), list(range(1, 20)))
def get_coin_buy_prices(api_urls: list):
    coins = list(get_coinnames())  # list changes infrequently, but may occasionally cause problems
    exchangerates = {}
    unfinished = []  # urls that weren't attempted due to a remote server disconnect
    blacklist = []
    results = []

    # TODO - Work on a proper add_coin_price progress algo
    def add_coin_price(url):
        exchangerate = ''  # declared up a scope!
        base = url[str(url).rfind('/') + 1:str(url).rfind('-')]  # rfinds start from 0 quirk to the rescue!
        try:
            print(datetime.datetime.now(), 'Progress ', 'url is', url)
            cachepath = 'cache/' + url[url.rfind('/') + 1:]
            with open(cachepath) as cache:
                print('Using Cache', cachepath)
                # Read the cache once; a second cache.read() would return an
                # empty string because the file pointer is already at the end.
                raw = cache.read()
                try:
                    exchangerate = json.loads(raw)
                except AttributeError:
                    exchangerate = json.loads(raw.decode())
                exchangerate = exchangerate['ticker']['buy']
                print('Base Is', base, 'Exchangerate is', exchangerate)
        except JSONDecodeError:
            exchangerate = 0.0
            return {'blacklist': url}
        # except (RemoteDisconnected, ProtocolError, ConnectionError):
        #     # urls not attempted because of a remote server disconnect
        #     exchangerate = 0.0
        #     return {'TODO': url}
        return {str(base): [{str(url): float(exchangerate)}]}

    with Pool() as p:
        # parse the urls for prices
        results = p.map(add_coin_price, api_urls)
    # exchangerates = dict(ChainMap(*results))
    print(results)

    # build the json file, by collating the "results" list of dicts into a json dict by key.
    def collate_by_coin(dict_key):
        flattened_dict = {dict_key: []}
        for result in results:
            try:
                if result[dict_key]:
                    # Each successful result is {base: [{url: rate}]}; pull out the
                    # url and rate (result.values is a method and cannot be indexed
                    # directly, as the original code tried to do).
                    url, rate = list(result[dict_key][0].items())[0]
                    # get the cross-exchange pair from the url
                    pair = url[url.rfind('-') + 1:url.rfind('.')]
                    flattened_dict[dict_key].append({pair: rate})
            except Exception:
                # key error .. continue to next iteration
                continue
        return dict(flattened_dict)

    def collate_by_unfinished(result: dict):
        try:
            # print('Unfinished Run Is', result)  # uncomment for debug
            if 'TODO' in result:
                print(result)
                return result['TODO']
        except Exception as e:
            print('unfinished result is', result)
            print(repr(e))

    def collate_by_blacklist(result: dict):
        try:
            # print('Unfinished Run Is', result)  # uncomment for debug
            if 'blacklist' in result:
                print(result)
                return result['blacklist']
        except Exception as e:
            print('blacklist result is', result)
            print(repr(e))

    # TODO - finish ccex.json
    with Pool() as p:
        # coins = list(dict(ChainMap(*results)).keys())
        exchangerateslist = p.map(collate_by_coin, coins)
    for exchangeratedict in exchangerateslist:
        exchangerates.update(exchangeratedict)

    # Collate By Unfinished
    with Pool() as p:
        unfinished = p.map(collate_by_unfinished, results)

    with Pool() as p:
        blacklist = p.map(collate_by_blacklist, results)

    return {
        'exchangerates': exchangerates,
        'unfinished': unfinished,
        'blacklist': blacklist
    }
def main():
    time0 = time.time()
    parser = ArgumentParser()
    parser.add_argument(
        '--years', dest='s_years', action='store', type=str,
        help='Give a list of years as a string, such as "1980,1981". Optional.')
    parser.add_argument(
        '--local', dest='do_local', action='store_true', default=False,
        help='Check for locally running plex server.')
    parser.add_argument(
        '--dirname', dest='dirname', action='store', type=str,
        default=os.getcwd(),
        help='Directory into which to store those plots. Default is %s.' % os.getcwd())
    parser.add_argument(
        '--noverify', dest='do_verify', action='store_false', default=True,
        help='If chosen, do not verify SSL connections.')
    args = parser.parse_args()

    # ## function to do the processing
    step = 0
    print('%d, started on %s' %
          (step, datetime.datetime.now().strftime('%B %d, %Y @ %I:%M:%S %p')))
    if args.s_years is not None:
        try:
            years = sorted(set(map(lambda tok: int(tok), args.s_years.split(','))))
        except:
            step += 1
            print('%d, did not give a valid set of years.' % step)
            years = []
    else:
        years = []

    # ## get plex server token
    dat = core.checkServerCredentials(doLocal=args.do_local, verify=args.do_verify)
    if dat is None:
        step += 1
        print('\n'.join([
            '%d, error, could not access local Plex server in %0.3f seconds. Exiting...'
            % (step, time.time() - time0),
            '%d, finished on %s.' %
            (step + 1, datetime.datetime.now().strftime('%B %d, %Y @ %I:%M:%S %p'))
        ]))
        return
    fullURL, token = dat

    # ## first find out which libraries are the TV show ones
    library_dict = core.get_libraries(token, fullURL=fullURL, do_full=True)
    if library_dict is None:
        step += 1
        print('\n'.join([
            '%d, error, could not access libraries in plex server in %0.3f seconds. Exiting...'
            % (step, time.time() - time0),
            '%d, finished on %s.' %
            (step + 1, datetime.datetime.now().strftime('%B %d, %Y @ %I:%M:%S %p'))
        ]))
        return
    #
    valid_keys = list(
        filter(lambda key: library_dict[key][-1] == 'show', library_dict))
    if len(valid_keys) == 0:
        step += 1
        print('\n'.join([
            # Arguments must match the format string: step first, then elapsed time.
            '%d, Error, could not find a TV show library in %0.3f seconds. Exiting...'
            % (step, time.time() - time0),
            '%d, finished on %s.' %
            (step + 1, datetime.datetime.now().strftime('%B %d, %Y @ %I:%M:%S %p'))
        ]))
        return
    tvlib_title = library_dict[max(valid_keys)][0]
    step += 1
    print('%d, found TV library: %s.' % (step, tvlib_title))

    # ## now get the TV shows
    tvdata = core.get_library_data(tvlib_title, token=token, fullURL=fullURL,
                                   num_threads=16)
    showsToExclude = tv.get_shows_to_exclude(tvdata)
    if len(showsToExclude) != 0:
        step += 1
        print('%d, excluding these TV shows: %s.' %
              (step, '; '.join(showsToExclude)))

    # ## now actual meat of the computation
    tvdata_date_dict = tv.get_tvdata_ordered_by_date(tvdata)
    min_year = min(tvdata_date_dict.keys()).year
    max_year = max(tvdata_date_dict.keys()).year
    possible_years_set = set(map(lambda date: date.year, tvdata_date_dict))
    step += 1
    if len(years) == 0:
        years = sorted(possible_years_set)
        print('%d, no years specified. We will use %s total: %s.' %
              (step, _print_years(len(years)),
               ', '.join(map(lambda year: '%d' % year, years))))
    else:
        cand_years = sorted(set(years) & possible_years_set)
        if len(cand_years) == 0:
            print('\n'.join([
                '%d, no intersection between the %s chosen (%s) and the %d years in the library.'
                % (step, _print_years(len(years)),
                   # The original was missing the map() call around this lambda.
                   ', '.join(map(lambda year: '%d' % year, years)),
                   len(possible_years_set)),
                'Instead, we will use %s total: %s.'
                % (_print_years(len(possible_years_set)),
                   ', '.join(map(lambda year: '%d' % year,
                                 sorted(possible_years_set))))
            ]))
            years = sorted(possible_years_set)
        else:
            print('%d, we found %s to use: %s.' %
                  (step, _print_years(len(cand_years)),
                   ', '.join(map(lambda year: '%d' % year, cand_years))))
            years = cand_years

    step += 1
    print('%d, started processing %s of TV shows after %0.3f seconds.' %
          (step, _print_years(len(years)), time.time() - time0))
    manager = Manager()
    shared_step = manager.Value('step', step)
    num_procced = manager.Value('nump', 0)
    lock = manager.RLock()
    pool = Pool(processes=cpu_count())

    def _process_year(year):
        tv.create_plot_year_tvdata(tvdata_date_dict, year, shouldPlot=True,
                                   dirname=args.dirname)
        lock.acquire()
        shared_step.value += 1
        num_procced.value += 1
        print('%d, finished processing year = %d (%02d / %02d) in %0.3f seconds.'
              % (shared_step.value, year, num_procced.value, len(years),
                 time.time() - time0))
        lock.release()

    _ = list(pool.map(_process_year, years))
    step = shared_step.value + 1
    print('\n'.join([
        '%d, processed all %s in %0.3f seconds.' %
        (step, _print_years(len(years)), time.time() - time0),
        '%d, finished everything on %s.' %
        (step + 1, datetime.datetime.now().strftime('%B %d, %Y @ %I:%M:%S %p'))
    ]))
    else:
        return None


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--scanner', dest='scanner', action='store', nargs=1,
                        default=None,
                        help='Run N4 correction on flair and t1 images from specific scanner (mg or uc)')
    args = parser.parse_args()
    # Guard against a missing --scanner argument: the default is None, which cannot be indexed.
    scanner = args.scanner[0] if args.scanner else None
    scan_keys = get_scans('subjects')
    if scanner == 'mg':
        commands = [bias_correct(scan_key + '/FLAIR-corrected.nii.gz', scanner, scan_key)
                    for scan_key in scan_keys]
        commands.extend([bias_correct(scan_key + '/MPRAGE-corrected-reorn.nii.gz', scanner, scan_key)
                         for scan_key in scan_keys])
    elif scanner == 'uc':
        commands = [bias_correct('/home/shengwei/work/vbm/uc/flair/raw/' + scan_key + '.nii.gz',
                                 scanner, scan_key)
                    for scan_key in scan_keys]
        commands.extend([bias_correct('/media/shengwei/BackupData/fmri/uc/150908/'
                                      + scan_key + '/t1-reorient.nii.gz', scanner, scan_key)
                         for scan_key in scan_keys])
    else:
        print('Type -h for usage, exiting...')
        exit(1)
    with Pool(cpu_count() - 2) as pool:
        for _ in tqdm(pool.imap(system, commands), total=len(commands)):
            pass
def ks_sampling_mem(X, seed=None, n_result=None, n_proc=4, n_batch=1000):
    """
    ks_sampling_mem(X, seed=None, n_result=None, n_proc=4, n_batch=1000)

    Kennard-Stone Full Sampling Program (with limited memory)

    If the user has enough memory, using `ks_sampling` instead of
    `ks_sampling_mem` is strongly recommended.

    This program can handle very large datasets. To keep the memory cost as
    low as possible, `n_batch` can be set to about sqrt(X.shape[0]) manually.
    However, if efficiency is the first priority, `n_batch` can be set as
    large as possible.

    NOTE! Only the Euclidean distance is available currently!

    Parameters
    ----------
    X: np.ndarray, shape: (n_sample, n_feature)
        Original data, need to be generated by user.
    seed: np.ndarray or list or None, shape: (n_seed, ), optional
        Initial selected seed. If set as `None`, the program will find the
        two samples which have the largest distance as the seed.
    n_result: int or None, optional
        Number of samples that should be selected. If set as `None`,
        `n_sample` will be used instead, i.e. select all data.
    n_proc: int, optional
        Number of Python's multiprocessing processors.
        NOTE! This variable only controls Python's code.
        NOTE! Only used in finding the maximum distance, not in KS sampling.
    n_batch: int, optional
        The dimension of the distance-matrix block evaluated in one processor.
    """
    X = np.asarray(X, dtype=float)
    n_sample = X.shape[0]
    if n_result is None:
        n_result = X.shape[0]
    # Find the most distant pair of samples if no seed is provided
    if seed is None or len(seed) == 0:
        t = np.einsum("ia, ia -> i", X, X)

        def get_dist_slice(sliceA, sliceB):
            distAB = t[sliceA, None] - 2 * X[sliceA] @ X[sliceB].T + t[None, sliceB]
            if sliceA == sliceB:
                np.fill_diagonal(distAB, 0)
            return np.sqrt(distAB)

        def get_maxloc_slice(slice_pair):
            dist_slice = get_dist_slice(slice_pair[0], slice_pair[1])
            max_indexes = np.unravel_index(np.argmax(dist_slice), dist_slice.shape)
            return (dist_slice[max_indexes],
                    max_indexes[0] + slice_pair[0].start,
                    max_indexes[1] + slice_pair[1].start)

        p = list(np.arange(0, n_sample, n_batch)) + [n_sample]
        slices = [slice(p[i], p[i + 1]) for i in range(len(p) - 1)]
        slice_pairs = [(slices[i], slices[j])
                       for i in range(len(slices))
                       for j in range(len(slices)) if i <= j]
        # Use a name distinct from the boundary list `p` above for clarity.
        with Pool(n_proc) as pool:
            maxloc_slice_list = pool.map(get_maxloc_slice, slice_pairs)
        max_indexes = maxloc_slice_list[np.argmax(
            [v[0] for v in maxloc_slice_list])][1:]
        seed = max_indexes
    seed = np.asarray(seed, dtype=np.uintp)
    return ks_sampling_mem_core(X, seed, n_result)
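# Hedged usage sketch for ks_sampling_mem above: select 100 Kennard-Stone
# samples from a random 10,000 x 8 dataset. The return value is whatever
# ks_sampling_mem_core (defined elsewhere in this module) produces for the
# chosen seed, typically the indices of the selected samples.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    X_demo = rng.standard_normal((10000, 8))
    selected = ks_sampling_mem(X_demo, n_result=100, n_proc=4, n_batch=1000)
    print(selected)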
def prop_exploit_scrobbles(fi):
    blocks = pd.read_pickle(fi)['block']
    cnts = pd.DataFrame({'n': blocks.value_counts().sort_index()})
    cnts['last-n'] = cnts['n'].shift(1)
    cnts['switch'] = cnts.apply(
        lambda row: 1 if ((row['last-n'] == 1) and (row['n'] > 1)) or
                         ((row['last-n'] > 1) and (row['n'] == 1)) else 0,
        axis=1)
    cnts['exp-idx'] = cnts['switch'].cumsum()
    result = cnts.groupby('exp-idx').apply(
        lambda grp: pd.Series({'n': len(grp), 'exploit': 0})
        if grp['n'].iloc[0] == 1
        else pd.Series({'n': grp['n'].sum(), 'exploit': 1}))
    exploit = result[result['exploit'] == 1]['n'].sum()
    explore = result[result['exploit'] == 0]['n'].sum()
    return (explore, exploit)


pool = Pool(cpu_count())

start = time.time()
result_patches = pool.map(prop_exploit_patches, files_patches)
# In Python 3, print() needs the full expression as its argument.
print((time.time() - start) / 60.)

start = time.time()
result_scrobbles = pool.map(prop_exploit_scrobbles, files_scrobbles)
print((time.time() - start) / 60.)

result_patches = np.vstack(result_patches)
result_scrobbles = np.vstack(result_scrobbles)

np.save('ee_count_patches.npy', result_patches)
np.save('ee_count_scrobbles.npy', result_scrobbles)
list_link = goodreads_link + list_name total_pages = get_last_page_num(list_link) p = 165 book_db_file = "goodreads_list_props.csv" #os.remove(book_ratings_file_name) if not os.path.exists(book_db_file): with open(book_db_file, 'w') as f: f.write( "book_name,author,rating,votes,description,book_type,no_of_pages,first_published,isbn13,genre,link\n" ) book_ratings_db = pd.read_csv(book_db_file, sep=",", quotechar="\"") for p in range(410, total_pages): page_id = '' if p == 0 else "?page=" + str(p + 1) current_link = list_link + page_id print(current_link) all_links = request_and_find_type(current_link, "a") all_books = list( set(search_for_text(all_links, "\"(/book/show/.*?)\""))) all_book_links = ["https://www.goodreads.com/" + x for x in all_books] # [process_book(x, book_ratings_db, book_db_file) for x in all_book_links] pool = Pool(4) list( pool.map(lambda x: process_book(x, book_ratings_db, book_db_file), all_book_links)) # list(map(lambda x: process_book(x, book_ratings_db, book_db_file), all_book_links)) current_book_link = search_string = "https://www.goodreads.com//book/show/20578795-meditation-as-a-way-of-life"