Code example #1
def GroupByParallelProcess(tweetsDF, cores, groupMethod):
    """
    Group by and aggregate on time via a parallel process
    """

    tweetsDF.label_date = tweetsDF.label_date.astype(int)
    tweetsDF = tweetsDF.set_index("label_date")
    # Parallelizing using Pool.apply()
    df_split = GetListOfSplitDFs(tweetsDF, cores)
    # create the multiprocessing pool
    pool = Pool(cores)
    # process the DataFrame by mapping function to each df across the pool
    logging.info("Starting the grouping and aggregating process.")
    if groupMethod == "weighted-average":
        df_out = pool.map(PerformGroupbyAndAggregate, df_split)
    elif groupMethod == "sum":
        df_out = pool.map(PerformSum, df_split)
    elif groupMethod == "mean":
        df_out = pool.map(PerformMean, df_split)
    else:
        logging.error("Choose correct group by method.")
        # release the pool before bailing out so its worker processes are not leaked
        pool.close()
        pool.join()
        pool.clear()
        return None

    # close down the pool and join
    pool.close()
    pool.join()
    pool.clear()

    logging.info("Ended the grouping and aggregating process.")

    return df_out
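A minimal usage sketch for the helper above, assuming `tweetsDF` has a `label_date` column and that the splitting/aggregation helpers it calls are importable: `pool.map` returns one aggregated DataFrame per split, so a caller would typically concatenate the pieces.

import pandas as pd

# Hypothetical caller: combine the per-split results back into a single frame.
df_parts = GroupByParallelProcess(tweetsDF, cores=4, groupMethod="sum")
if df_parts is not None:
    tweets_by_date = pd.concat(df_parts)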
Code example #2
def q5_plot_chromatic_num_bounds_by_prob(n, prange, pstep, k=None,\
    clique_finder=greedy_find_clique_number, multi=False):
    """Plots a graph of number of colours against edge probability,
    for each of the various lower/upper bounds of chromatic number
    multi: True/False/int multiprocessing - yes/no/ num processes (default 4 if true)
    """
    probs = np.arange(prange[0], prange[1], pstep)
    graphs = [[get_random_graph(n, p, k) for _ in range(10)] for p in probs]
    mean_bounds = []
    pool = Pool(multi if type(multi) is int else 4)
    # graph_generator = pool.imap(multiprocessing_chrom_bounds_func, graphs) if multi else map(f, graphs)
    f = lambda graphs_list: list(map(get_chromatic_number_bounds, graphs_list))
    graph_generator = pool.imap(f, graphs) if multi else map(f, graphs)

    for bounds in tqdm.tqdm(graph_generator, total=len(graphs)):
        mean_bounds.append(np.mean(bounds, axis=0))

    pool.close()
    pool.join()

    mean_bounds = np.array(mean_bounds)
    plt.figure()
    for i, label in zip(range(mean_bounds.shape[1]), \
        ['lb_comp', 'lb_clique', 'ub_clique', 'ub_greedy_rand', 'ub_greedy_msd']):
        plt.plot(probs, mean_bounds[:, i], label=label)
    plt.legend()

    return probs, mean_bounds
Code example #3
File: data.py Project: MunKeat/cs4224f-mongodb
 def preprocess(self):
     # Check if orderline should be extracted
     extract_orderline = conf['extract_orderline']
     pool = Pool()
     start = time.time()
     # Run in parallel
     if extract_orderline:
         res_orderline = pool.apipe(self.create_orderline,
                                    return_dataframe=False)
     res_warehouse = pool.apipe(self.create_warehouse)
     res_district = pool.apipe(self.create_district)
     res_order = pool.apipe(self.create_order)
     res_customer = pool.apipe(self.create_customer)
     res_stock = pool.apipe(self.create_stock)
     # Consolidate result
     pool.close()
     pool.join()
     list_of_processed_files = [res_warehouse.get(), res_district.get(),
                                res_order.get(), res_customer.get(),
                                res_stock.get()]
     if extract_orderline:
         list_of_processed_files.append(res_orderline.get())
     end = time.time()
     self.debug("Preprocessing of csv file took {}s".format(end - start))
     return list_of_processed_files
Code example #4
def perplexity(lang="eng"):
    """
    finds satistical perplexity of the language model in Google Books N-Gram
    dataset.
    """
    pool = ProcessingPool(4)
    unigram_counter, mgram_counter, ngram_counter= pool.map(get_ngram_counter,
                                              [1,2,3],
                                              [lang] * 3)
    pool.close()
    pool.join()

    total_words = np.sum(np.array(list(unigram_counter.values())))
    print("total_words = ", total_words)

    ngram_conditionals = get_ngram_conditionals(ngram_counter,
                                            mgram_counter)

    probs = np.power(np.array(list(ngram_conditionals.values()),
                              dtype=np.float64),
             -np.array(list(ngram_counter.values()), dtype=np.float64) \
             / total_words)

    print("probs shape = ", probs.shape)

    PP = (np.prod(probs, dtype=np.float64))

    return PP
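For reference, pathos' `ProcessingPool.map` accepts several argument sequences and zips them element-wise, which is what the `get_ngram_counter` call above relies on. A standalone sketch with a made-up worker function:

from pathos.multiprocessing import ProcessingPool

def describe_ngram(n, lang):
    # hypothetical two-argument worker; one call per zipped (n, lang) pair
    return "{}-gram ({})".format(n, lang)

if __name__ == "__main__":
    pool = ProcessingPool(3)
    print(pool.map(describe_ngram, [1, 2, 3], ["eng"] * 3))
    pool.close()
    pool.join()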
Code example #5
    def multi_Non_Tweep_friends(self, handle):
        min_position, links = self.get_tweets(handle)
        print("Scraping last 100 days of activity")

        while True:
            min_position1, links1 = self.get_tweets(handle, min_position)
            links = links + links1
            if min_position1 is None:
                break
            min_position = min_position1

        people_list = []

        link = [x for x in links if handle in x]
        link = self.duplicates(link)

        with Pool(10) as p:  # pool size sets how many requests run at a time

            records = list(tqdm(p.imap(self.get_people, link),
                                total=len(link)))
            p.close()
            p.join()
            people_list = [item for sublist in records for item in sublist]
            people_list = self.duplicates(people_list)

        people_list = [x for x in people_list if x != handle]

        return (people_list)
Code example #6
    def multi_word_cut(self, sentences):
        print('Multiprocessing Word cut ')
        if self.language == 'ch':
            jieba.initialize(
            )  # initialize first, or it will initialize in each process
            jieba.disable_parallel()

            def func(line):
                line = [i.strip() for i in jieba.cut(line, cut_all=False)]
                return [
                    i for i in line
                    if ((not i.isdigit()) and (i not in self.stop_words))
                ]
        else:

            def func(line):
                return [i.lower() for i in line.split(" ")
                        if (not i.isdigit()) and (i not in self.stop_words) and (len(i) > 1)]

        pool = Pool(nodes=5)
        t0 = time.time()
        word_cut = pool.map(func, sentences)
        pool.close()
        pool.join()
        pool.clear()
        print('MultiProcess  time {:.0f}'.format(time.time() - t0))
        return word_cut
Code example #7
File: submission.py Project: ruanchaves/BERT-WS
def parallelize_dataframe(df, func, n_cores=16):
    df_split = np.array_split(df, n_cores)
    pool = Pool(n_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df
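A usage sketch for `parallelize_dataframe` (the `add_text_len` worker is hypothetical): each core receives one chunk of the frame and the partial results are concatenated in their original order.

import pandas as pd

def add_text_len(chunk):
    # runs in a worker process on one chunk of the DataFrame
    chunk["text_len"] = chunk["text"].str.len()
    return chunk

df = pd.DataFrame({"text": ["a", "bb", "ccc", "dddd"]})
df = parallelize_dataframe(df, add_text_len, n_cores=2)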
Code example #8
File: minnie.py Project: JoaoRodrigues/minnie
def findbonds(self):
    """Calculates interactions between and/or within monomers"""
    if self.help:
        print(
            "Calculates interactions between and/or within monomers\n"
            f'\n\033[1mUsage: minnie findbonds \n'
            f'                        -cn, --complexName     <string>     \n '
            f'                                               Project ID of your complex\n\n'
            f'                        -p, --pdbs             [<.pdb>/<path>] (singleframe.pdb)   \n'
            f'                                               Give single *.pdb or give folder path \n\n'
            f'                        -i                     [<hbonds>/<ionic>/<hydrophobic>/<ring_stacking>/<all>] (hbonds)    \n'
            f'                                               Calculates which types of interactions \n\n'
            f'                        -d                      <float> (2.5)                 \n'
            f'                                               Cut-off to define a hydrogen bond\n\n'
            f'                        -intra, --includeIntra [<"True">/<"False">] ("False")  \n'
            f'                                               What do you want to analyze, all or only inter-monomer contacts? \033[0m \n\n\n\n'
            f'\n\033[1mUsage example:\033[0m\n\n'
            " Single frame    - minnie findbonds -cn sox4 -p sox4/02_frames/md_0.pdb -i hbonds  -s False  \n"
            " Multiple frames - minnie findbonds -cn sox4 -p sox4/02_frames/* -i hbonds \n"
            " Multiple frames - minnie findbonds -cn sox4 -p sox4/02_frames/* -i all \n"
        )
    elif not self.pdbs:
        print("Please provide a pdb file or folder path with -p/--pdbs")
    elif not self.complexName:
        print("Please specify complex name(s) with -cn/--complexName")

    elif self.systematic == "True":
        pdb_list = self.pdbs
        if self.intType == "all":
            for intType in ["hbonds", "ionic", "hydrophobic", "ring_stacking"]:
                pool = Pool(pathos.multiprocessing.cpu_count() - 2)
                pool.map(analysis.comb_int, pdb_list,
                         len(pdb_list) * [str(self.complexName)],
                         len(pdb_list) * [str(intType)],
                         len(pdb_list) * [str(self.includeIntra)],
                         len(pdb_list) * [str(self.hbond_distance)])
                # shut the pool down between interaction types; clear() removes it
                # from pathos' cache so the next Pool(...) call starts a fresh pool
                pool.close()
                pool.join()
                pool.clear()

        else:
            pool = pathos.multiprocessing.ProcessingPool(
                pathos.multiprocessing.cpu_count() - 2)
            pool.map(analysis.comb_int, pdb_list,
                     len(pdb_list) * [str(self.complexName)],
                     len(pdb_list) * [str(self.intType)],
                     len(pdb_list) * [str(self.includeIntra)],
                     len(pdb_list) * [str(self.hbond_distance)])
            pool.close()
        analysis.combine_interfacea_results(self.complexName)
    elif self.systematic == "False":
        if self.intType == "all":
            for intType in ["hbonds", "ionic", "hydrophobic", "ring_stacking"]:
                analysis.comb_int(self.pdbs[0], self.complexName, intType,
                                  self.includeIntra, self.hbond_distance)
        else:
            analysis.comb_int(self.pdbs[0], self.complexName, self.intType,
                              self.includeIntra, self.hbond_distance)

        analysis.combine_interfacea_results(self.complexName)
Code example #9
def test_multiprocess():
    x_list = [1, 2, 3, 4, 5, 6, 7]
    y_list = ['1', '2', '3', '4', '5', '6', '7']
    epoch = 8
    pool = Pool(epoch)
    res = pool.amap(test_task, x_list, y_list)  # asynchronous map
    pool.pipe(test_task, '22', '222')           # blocking single call
    pool.close()
    pool.join()
    res.get()  # fetch the asynchronous results (also surfaces any worker error)
Code example #10
class ConsensusMHSampler(MHSampler):
	def __init__(self, log_f, log_g, g_sample, x0, iterations, shards=1):
		super(ConsensusMHSampler, self).__init__(log_f, log_g, g_sample, x0, iterations)
		self.shards = shards

		assert len(self.log_distribution_fn) == self.shards
		self.log_fn_dict = {} # for pickling purposes
		for i in range(self.shards):
			self.log_fn_dict[i] = self.log_distribution_fn[i]

		self.pool = Pool(nodes=self.shards)

	def sample(self):
		map_results = self.pool.map(self.map_sample, range(self.shards))
		self.pool.close()
		self.pool.join()
		self.pool.terminate()
		self.pool.restart()
		self.saved_states = self.reduce_sample(map_results)

	def map_sample(self, index):
		np.random.seed(1)
		cur_state = self.start_state
		sample_results = [cur_state]
		prob, count = 0, 0

		for i in range(self.iterations):
			if i % 5000 == 0:
				print("iteration {}".format(i))
			candidate_state = self.get_transition_sample(cur_state)
			acceptance = self.calculate_acceptance_ratio(candidate_state, self.log_fn_dict[index])
			prob += acceptance
			count += 1

			new_state = self.transition_step(cur_state, candidate_state, acceptance)
			sample_results.append(new_state)
			cur_state = new_state
		sample_results = np.array(sample_results)

		print("INDEX {}: Avg acceptance prob is {}".format(index, prob/count))

		return (sample_results, 1.0 / (1e-8 + self.get_sample_variance(sample_results)))

	def get_sample_variance(self, data):
		return np.linalg.norm(np.var(np.array(data), axis=0))

	def reduce_sample(self, results):
		'''
			results is a list of (sample_array, weight) tuples
		'''
		sample_results = 0
		total_weight = 0
		for sample, weight in results:
			sample_results += weight * sample
			total_weight += weight

		return sample_results / total_weight
Code example #11
def parallelize_dataframe(df: pd.DataFrame, func, n_cores=4) -> pd.DataFrame:
    df_split = np.array_split(df, n_cores)
    pool = Pool(n_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    # have to include this to prevent leakage and allow multiple parallel function calls
    pool.terminate()
    pool.restart()
    return df
Code example #12
def goo():
    pool = Pool(4)
    #    def f(x):
    #        return foo(100 + x)
    stuff = list(tqdm.tqdm(pool.imap(foo, range(20)), total=20))
    print(stuff)
    print('aaa')
    pool.close()
    pool.join()
    print('bbb')
Code example #13
File: spatial_utils.py Project: bubalis/VT_pIndex
def parallelize(data, func, num_of_processes=8):
    '''Function for paralellizing any function on a dataframe.
    Stolen from stack overflow, user Tom Raz:
    https://stackoverflow.com/questions/26784164/pandas-multiprocessing-apply'''
    data_split = np.array_split(data, num_of_processes)
    pool = Pool(num_of_processes)
    data = pd.concat(pool.map(func, data_split))
    pool.close()
    pool.join()
    return data
Code example #14
File: stats.py Project: prashjet/dynclust
def get_jsd_gmm(gmmfit, savefile=None, multiprocessing=False, n_pool=10):

    n_idx = len(gmmfit)
    idx = np.arange(n_idx)
    labels = gmmfit['label']
    i_list, i_label = [], []
    j_list, j_label = [], []
    for i in range(n_idx):
        i_list = i_list + list(np.repeat(int(idx[i]), n_idx - i - 1))
        i_label = i_label + list(np.repeat(labels[i], n_idx - i - 1))
        if i < n_idx:
            j_list = j_list + list(idx[i + 1::])
            j_label = j_label + list(labels[i + 1::])
    n_pairs = len(i_list)

    gc_pairs = table.Table()
    gc_pairs['i'] = i_label
    gc_pairs['j'] = j_label
    gc_pairs['jsd'] = np.zeros(len(gc_pairs), dtype='float64')

    n_gmm = len(gmmfit['weights'][0])
    gmm_i = mixture.GaussianMixture(n_components=n_gmm)
    gmm_j = mixture.GaussianMixture(n_components=n_gmm)

    def wrapper(idx):
        i, j = i_list[idx], j_list[idx]
        gmm_i.means_ = gmmfit['means'][i]
        gmm_i.weights_ = gmmfit['weights'][i]
        gmm_i.covariances_ = gmmfit['covars'][i]
        gmm_i.precisions_ = gmmfit['prec'][i]
        gmm_i.precisions_cholesky_ = gmmfit['prec_chol'][i]
        gmm_j.means_ = gmmfit['means'][j]
        gmm_j.weights_ = gmmfit['weights'][j]
        gmm_j.covariances_ = gmmfit['covars'][j]
        gmm_j.precisions_ = gmmfit['prec'][j]
        gmm_j.precisions_cholesky_ = gmmfit['prec_chol'][j]
        jsd = gmm_jsd(gmm_i, gmm_j)
        gc_pairs['jsd'][idx] = jsd
        return jsd

    if multiprocessing:
        pool = Pool(n_pool)
        jsd = pool.map(wrapper, range(n_pairs))
        pool.close()
        gc_pairs['jsd'] = jsd

    else:
        for idx in range(n_pairs):
            wrapper(idx)

    if savefile is not None:

        io.ascii.write(gc_pairs, savefile)

    return gc_pairs
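A side note on the example above: the `wrapper` closure captures `gmm_i`, `gmm_j`, and the index lists, yet it can still be shipped to worker processes because pathos serializes callables with dill; the standard-library `multiprocessing.Pool` cannot pickle such closures. A minimal illustration of that difference:

from pathos.multiprocessing import ProcessingPool as Pool

def make_scaler(factor):
    # the returned closure captures `factor`, which plain pickle cannot serialize
    return lambda x: factor * x

if __name__ == "__main__":
    pool = Pool(2)
    print(pool.map(make_scaler(10), [1, 2, 3]))  # works because pathos uses dill
    pool.close()
    pool.join()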
Code example #15
def parallelize_dataframe(df,
                          func,
                          num_partitions=num_cores,
                          num_cores=num_cores):
    df_split = np.array_split(df, num_partitions, axis=0)
    pool = Pool(num_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    pool.clear()
    return df
Code example #16
File: helper_funcs.py Project: jfuruness/lib_utils
@contextmanager  # contextlib.contextmanager: makes the yield-based function below a context manager
def Pool(cpus=cpu_count()) -> ProcessingPool:
    """Context manager for pathos ProcessingPool"""

    # Creates a pool with processes
    p = ProcessingPool(cpus)
    yield p
    # Need to clear due to:
    # https://github.com/uqfoundation/pathos/issues/111
    p.close()
    p.join()
    p.clear()
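With the `contextlib.contextmanager` decorator in place, the helper above is used like any other context manager; a sketch:

# Hypothetical caller of the Pool() helper defined above.
with Pool(cpus=2) as p:
    squares = p.map(lambda x: x * x, range(8))
print(squares)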
Code example #17
 def get_top_n(self, query, corpus, n=5):
     temp_corpus = [" ".join(ele) for ele in corpus]
     pool = Pathos_Pool(cpu_count())
     corpus_embeddings = self._encode_sentences(temp_corpus)
     query_embeddings = self._encode_sentences(query)
     scores = self._calc_similarity(query_embeddings, corpus_embeddings)
     pool.close()
     top_results = torch.topk(scores, n)
     return [{
         "idx": i,
         "document": corpus[i]
     } for i in top_results[1].numpy().tolist()]
Code example #18
File: utils.py Project: jfuruness/lib_bgp_data
@contextmanager  # contextlib.contextmanager: required for the yield-based context manager below
def Pool(threads: int, multiplier: int, name: str):
    """Context manager for pathos ProcessingPool"""

    # Creates a pool with threads else cpu_count * multiplier
    p = ProcessingPool(threads if threads else cpu_count() * multiplier)
    logging.debug(f"Created {name} pool")
    yield p
    # Need to clear due to:
    # https://github.com/uqfoundation/pathos/issues/111
    p.close()
    p.join()
    p.clear()
Code example #19
def finally_ip():
	(ipList,portList) = get_ip_list(url,headers)
	# Running pool = ThreadPool(2) sometimes raises "module '__main__' has no attribute '__spec__'"; not sure how to fix it
	# Tried setting __spec__ = None, which did not help
	pool = ThreadPool(4)
	start_time = time.time()
	results = pool.map(test_ip,ipList,portList)
	pool.close()
	pool.join()
	end_time = time.time()
	print("并行耗时:"+str(end_time-start_time))
	return results
Code example #20
def compute_scores(args):
    results_file = args.results_file
    scores_file = args.score_file
    num_captions = args.num_captions
    is_exp = args.exp
    generated_image_tokens = get_generated_tokens(res_file=results_file,
                                                  num=num_captions)
    gt_image_tokens = get_gt_tokens()
    print('number of test images: %d, all images: %d' % (
        len(generated_image_tokens), len(gt_image_tokens)))
    all_image_ids = list(generated_image_tokens.keys())

    def f(image_id_thread):
        image_ids = image_id_thread[0]
        thread_num = image_id_thread[1]
        scores = {}
        for image_id in image_ids:
            res_tokens = generated_image_tokens[str(image_id)]
            gt_tokens = gt_image_tokens[str(image_id)]
            wmd_score = word_mover_distance(res_tokens,
                                            gt_tokens,
                                            wvmodel=wvmodel,
                                            is_exp=is_exp)
            scores[image_id] = wmd_score
            print('Thread: %d, Image ID: %s, WMD score: %.5f' % (
                thread_num, image_id, wmd_score))
        return scores

    num_images = len(all_image_ids)
    num_workers = 20
    num_per_split = num_images // num_workers
    images_split = []
    for i in range(num_workers):
        if i == (num_workers - 1):
            images_split.append([all_image_ids[(i * num_per_split):], i])
        else:
            images_split.append([
                all_image_ids[(i * num_per_split):((i + 1) * num_per_split)], i
            ])
    pool = Pool(num_workers)
    all_scores = pool.map(f, images_split)
    pool.close()
    pool.join()
    scores = {}
    for s in all_scores:
        scores.update(s)
    with open(scores_file, 'w') as f:
        json.dump(scores, f)
    total_score = 0
    for key in scores.keys():
        total_score += scores[key]
    print('WMD score: %.5f' % (total_score / len(scores)))
Code example #21
    def get_top_n(self, query, corpus, n=5):
        # scores = np.zeros(self.corpus_size)
        temp_corpus = [" ".join(ele) for ele in corpus]
        pool = Pathos_Pool(cpu_count())
        scores = pool.map(self._calc_distance, [query] * self.corpus_size,
                          temp_corpus)
        pool.close()
        scores = np.array(scores)
        # for i, sent in enumerate(tqdm(corpus)):
        #     scores[i] = self._calc_distance(query, sent)

        top_n = np.argsort(scores)[::-1][:n]
        return [{"idx": i, "document": corpus[i]} for i in top_n]
Code example #22
File: initsync_pipe.py Project: iagcl/data_pipeline
def parallelise_initsync(argv, ssp_params, process_control_id, logger):
    # Pivot the collection of source_system_profile records into
    # three separate lists to enable us to call pool.map on each record
    (source_schemas, tables, target_schemas,
     query_conditions) = map(list, zip(*ssp_params))

    source_conn_detail = dbuser.get_dbuser_properties(argv.sourceuser)
    target_conn_detail = dbuser.get_dbuser_properties(argv.targetuser)

    logger.info("Processing tables with {} dedicated worker processes".format(
        argv.numprocesses))
    pool = Pool(nodes=argv.numprocesses)

    argvs = [argv] * len(tables)
    source_conn_details = [source_conn_detail] * len(tables)
    target_conn_details = [target_conn_detail] * len(tables)
    pcids = [process_control_id] * len(tables)
    # NB: list multiplication repeats the same managed queue proxy for every table
    queues = [manager.Queue()] * len(tables)

    logger.debug("Starting a new process for each table in: {tables}".format(
        tables=tables))
    # Execute initsync for each schema/table combination in parallel
    pool.map(initsync_table,
             argvs,
             source_conn_details,
             target_conn_details,
             source_schemas,
             tables,
             target_schemas,
             pcids,
             query_conditions,
             queues,
             chunksize=1)  # Ensure tables are processed in sequence
    # and workers are fully utilised

    pool.close()
    logger.debug("parallelise_initsync: Pool joining")
    pool.join()
    logger.debug("parallelise_initsync: Pool joined")

    all_table_results = {}
    for q in queues:
        size = q.qsize()
        message = q.get()
        logger.debug("Message queue size = {s}, message = {m}".format(
            s=size, m=message))
        all_table_results.update(message)

    logger.debug("all_table_results = {r}".format(r=all_table_results))
    return all_table_results
Code example #23
File: multiprocessing.py Project: notsoprocoder/mupa
 def parallel_apply(self, df, func):
     # add try statement re function not returning a DataFrame
     if self.preprocessing_checks(df, func):
         # split DataFrame into a list of smaller DataFrames
         self.df_split = np.array_split(df, self.partitions, axis=0)
         # create the multiprocessing pool
         pool = Pool(self.cores)
         # process the DataFrame by mapping function to each df across the pool
         df = pd.concat(pool.map(func, self.df_split), axis=0).copy()
         # close down the pool and join
         pool.close()
         pool.join()
         pool.clear()
         return df
Code example #24
def get_gt_tokens(
        coco_file='../data/files/dataset_coco.json',
        coco_tokens_file='../data/files/coco_tokens_Google_news.json'):
    if os.path.exists(coco_tokens_file):
        with open(coco_tokens_file, 'r') as f:
            dataset = json.load(f)
        return dataset

    print('Processing ground-truth data...')
    with open(coco_file, 'r') as f:
        dataset = json.load(f)

    def f(images):
        image_tokens = {}
        # images = dataset['images']
        for image in images:
            sentence = image['sentences']
            image_id = str(image['cocoid'])
            tokens = []
            for s in sentence:
                tokens.extend(s['tokens'])
            filter_token = []
            for token in tokens:
                if (token not in stop_words) and (token in vocab):
                    filter_token.append(token)
            # tokens = [token for token in tokens if token not in stop_words and token in vocab]
            image_tokens[image_id] = filter_token
        return image_tokens

    all_images = dataset['images']
    num_images = len(all_images)
    num_workers = 30
    num_per_split = num_images // num_workers
    images_split = []
    for i in range(num_workers):
        if i == (num_workers - 1):
            images_split.append(all_images[(i * num_per_split):])
        else:
            images_split.append(
                all_images[(i * num_per_split):((i + 1) * num_per_split)])
    pool = Pool(num_workers)
    all_images_tokens = pool.map(f, images_split)
    pool.close()
    pool.join()
    all_token_dict = {}
    for d in all_images_tokens:
        all_token_dict.update(d)
    with open(coco_tokens_file, 'w') as f:
        json.dump(all_token_dict, f)
    return all_token_dict
Code example #25
def makeRadial():
    rad, angle = d["radial"]["rad"], d["radial"]["angle"]
    args = np.linspace(angle, angle + np.pi, frameCount)

    pool = Pool(4)

    while True:
        subIm = JuliaTools.subImage(c=rad * np.exp(1j * angle),
                                    r=r,
                                    n=10,
                                    p=p,
                                    iters=iters,
                                    split=split,
                                    save=False,
                                    aura=False)
        isBlackList = pool.map(subIm, coords)
        if not all(isBlackList):
            break
        else:
            rad *= 0.975

    # Circular arc c follows in complex plane
    cPath = rad * np.exp(1j * args)

    for frame in xrange(frameCount):
        subIm = JuliaTools.subImage(c=cPath[frame],
                                    r=r,
                                    n=n,
                                    p=p,
                                    iters=iters,
                                    split=split)
        isBlackList = pool.map(subIm, coords)
        allBlack = all(isBlackList)

        if not allBlack:
            JuliaTools.makeFrame(frame, n, split, coords)

    pool.close()

    JuliaTools.prepareForFFmpeg(frameCount=frameCount, loop=True)

    with open("tweet.txt", "w") as out:
        out.write("Images generated using constants"
                  " on a circular arc of radius {:03.2f}.".format(rad))

    stop = timeit.default_timer()

    print stop - start
Code example #26
def apply_by_multiprocessing(df, func, **kwargs):
    """
    Parallel execution function for the DataFrame
    :param df: Input DataFrame
    :param func:
    :param kwargs: additional arguments for the df.apply() such as axis and et al.
    :return: Output DataFrame
    """
    workers = kwargs.pop('workers')
    pool = Pool(processes=workers)
    result = pool.map(_apply_df,
                      [(d, func, i, kwargs)
                       for i, d in enumerate(np.array_split(df, workers))])
    pool.close()
    result = sorted(result, key=lambda x: x[0])
    return pd.concat([i[1] for i in result])
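`_apply_df` is not shown in this snippet; given that the results are sorted on `x[0]` and concatenated from `i[1]`, it presumably unpacks the packed tuple, applies the function, and returns the chunk index alongside the processed chunk. A plausible sketch, not the original implementation:

def _apply_df(args):
    # args is the (chunk, func, index, kwargs) tuple packed by apply_by_multiprocessing
    d, func, i, kwargs = args
    return i, d.apply(func, **kwargs)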
Code example #27
    def make_query(self, size=1):

        ## quit if nr_unlabeled_samples = 1
        if self.dataset.len_unlabeled() == 1:
            return self.dataset.get_unlabeled_entries()[0].astype(int)

        ## Set the possible labels
        self.possible_labels = list(set(self.dataset.get_labeled_entries()[1]))

        ## Train the model
        self.model.train(self.dataset)

        ## Get probabilities
        X_ids, X = self.dataset.get_unlabeled_entries()
        pred = self.model.predict_proba(
            X)  # pred.shape = (n_unlabeled, nr_of_labels)

        ## Setup pool for cpu parallelisation
        p = Pool(cpu_count(), maxtasksperchild=1000)

        ## nr of unlabeled samples -> len(X)

        ## Get uncertainty after adding every sample with every label
        total = np.asarray(
            p.map(self._eer, X_ids,
                  len(X) * [self.dataset],
                  len(X) * [self.depth]))
        # total.shape = (n_unlabeled, nr_of_labels)

        ## Close the Pool again
        p.close()
        p.join()
        p.clear()

        ## Get the total uncertainty of one sample after adding a label weighted by the labels probability
        total = np.inner(
            pred,
            total,
        ).diagonal()  # total.shape = (n_unlabeled,)

        ## Zip it
        total = zipit(X_ids, total)

        ## Sort it
        results = sort_by_2nd(total, 'min')

        return results[:size, 0].astype(int)
Code example #28
    def start(self, text_data_dir, res_dir, nprocs=8):
        '''
        entry function

        text_data_dir: folder of raw data
        res_dir: folder of output
        nprocs: number of worker processes run in parallel
        '''
        p = PathosPool(nprocs)

        filepathsvec, filenamesvec, respaths = list(), list(), list()
        for dirpath, _, filenames in os.walk(text_data_dir):
            for filename in filenames:
                if (("gz" in filename) and ('md5' not in filename)
                        and ('copy' not in filename)):
                    filepath = os.path.join(dirpath, filename)
                    print(filepath)
                    res_name = filename.split(".")[0] + ".csv.gz"
                    respath = os.path.join(res_dir, res_name)
                    #if os.path.exists(respath):
                    # pass
                    #else:
                    if True:
                        filepathsvec.append(filepath)
                        filenamesvec.append(filename)
                        respaths.append(respath)
                        #p.apply_async(process_data, args = (filepath,filename,
                        # respath, True,
                        # [title_stop_path,
                        #  affil_stop_path,
                        #  mesh_stop_path]))
        self.affildicts = p.amap(
            partial(self.process_data,
                    stop_paths=[
                        self.title_stop_path, self.affil_stop_path,
                        self.mesh_stop_path
                    ],
                    rm_stopwords=True,
                    affiliation_correction=True,
                    select_journals=self.select_journals), filepathsvec,
            filenamesvec, respaths)

        p.close()
        p.join()  # Having an issue joining
        print("joined")
        p.clear()  # Delete the pool
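Note that `amap` returns an asynchronous result object rather than the mapped values themselves, so `self.affildicts` would still need a `.get()` call to yield the actual dictionaries. A small standalone sketch of that pattern:

from pathos.multiprocessing import ProcessingPool as PathosPool

if __name__ == "__main__":
    p = PathosPool(2)
    async_result = p.amap(pow, [2, 3, 4], [2, 2, 2])  # returns immediately
    p.close()
    p.join()
    print(async_result.get())  # [4, 9, 16] once the workers finish
    p.clear()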
Code example #29
def makePower():
    global c
    pMin, pMax = d["power"]["pMin"], d["power"]["pMax"]

    pPath = np.linspace(pMin, pMax, frameCount)

    pool = Pool(4)

    # Get interesting c
    while True:
        subIm = JuliaTools.subImage(c=c,
                                    n=10,
                                    iters=iters / 2,
                                    r=r,
                                    p=pMin,
                                    split=split,
                                    save=False,
                                    aura=False)
        isBlackList = pool.map(subIm, coords)
        if not all(isBlackList):
            break
        else:
            c *= 0.975

    for frame in xrange(frameCount):
        subIm = JuliaTools.subImage(c=c,
                                    r=r,
                                    n=n,
                                    p=pPath[frame],
                                    iters=iters / 2,
                                    split=split)
        isBlackList = pool.map(subIm, coords)
        allBlack = all(isBlackList)

        if not allBlack:
            JuliaTools.makeFrame(frame, n, split, coords)

    pool.close()

    JuliaTools.prepareForFFmpeg(frameCount=frameCount, loop=True)

    with open("tweet.txt", "w") as out:
        out.write("woooooooooooooooooooo")

    stop = timeit.default_timer()

    print stop - start
Code example #30
    def _mp_improve(self, container, scenario_builder):
        """Improves b/2 best solutions from the container and updates
        the score table with the generated solutions
        """
        container.sort()
        pool = Pool(processes=self._proc_count)

        logging.info("Starting processes")
        start = datetime.now()
        best = []
        builders = []
        for i in range(self._b // 2):  # integer division, so this also runs under Python 3
            best.append(container.get(i))
            builders.append(scenario_builder)

        try:
            result = pool.map(self._improve, best, builders)
            pool.close()
            pool.join()
        except MemoryError as e:
            send_email("I crashed again, please help!")
            import pudb
            pudb.set_trace()
            print(e)

        logging.info("Processes finished - %s" % (datetime.now() - start))
        # How infuriating was that?!
        # pathos was being smart and was caching pool so this is needed
        # to prevent from erroring out
        pool.restart()

        start = datetime.now()
        logging.info("mp_improve second loop")
        for entry in result:
            index = container.index(entry['individual'])
            best = entry['improvements'].get(0)
            if best.get_utility() < entry['individual'].get_utility():
                container.replace(best, index)

            for improvement in entry['improvements'].get_all():
                self._update_score_table(improvement)

        logging.info("mp_improve second loop - %s" % (datetime.now() - start))
        logging.info("Improved %d solutions" % container.get_changes())
        container.reset_changes()
        return container
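The comment above about pathos caching the pool refers to pools being memoized by their configuration: requesting a pool of the same size after `close()` hands back the already-closed instance, so it must be `restart()`ed (or `clear()`ed) before it can map again. A minimal illustration under that assumption:

from pathos.multiprocessing import ProcessingPool as Pool

if __name__ == "__main__":
    pool = Pool(2)
    print(pool.map(abs, [-1, -2]))
    pool.close()
    pool.join()
    # Pool(2) would now return the same, closed pool object,
    # so bring it back to life before mapping again.
    pool.restart()
    print(pool.map(abs, [-3, -4]))
    pool.clear()  # finally drop it from pathos' internal cache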
Code example #31
class analyze(setup.setup):

    def __init__(self,args,logging_level=logging.INFO):

         super(analyze, self ).__init__(args,logging_level)


    # set up processing pool and run all analyses specified in args
    def run(self):


        if self.args.jumpdists:
            n_bins=100.
            bin_width = 1/n_bins
            bins = np.arange(0,1+bin_width,1/n_bins)

            if self.args.file:
                user,vals = self.artist_jump_distributions(self.args.file,bins=bins,self_jumps=False)
                with open(self.args.resultdir+user,'w') as fout:
                    fout.write(','.join(vals.astype(str))+'\n')



            else:
                raise NotImplementedError('not implemented!')
                self.pool = Pool(self.args.n)
                self.rootLogger.info("Pool started")

                self.rootLogger.info("Starting jump distance analysis")

                func_partial = partial(self.artist_jump_distributions,bins=bins,self_jumps=False)
                with open(self.args.resultdir+'jumpdists','w') as fout:
                    for user,vals in self.pool.imap(func_partial,self.listen_files):
                        fout.write(user+'\t'+','.join(vals.astype(str))+'\n')

                self.pool.close()
                self.rootLogger.info("Pool closed")

        if self.args.blockdists:
            #self.rootLogger.info("Starting block distance analysis")
            self.mean_block_distances(self.args.file)

        if self.args.diversity_dists:
            bins = np.arange(0,1.01,.01)
            self.diversity_distributions(self.args.file,bins=bins)

        if self.args.clustering:
            self.clustering(self.args.file)

        if self.args.values:
            self.patch_values(self.args.file)

        if self.args.exp:
            self.explore_exploit(self.args.file)

        if self.args.patch_len_dists:
            self.patch_len_dists(self.args.file)


    # calculate distribution (using histogram with specified bins)
    # of sequential artist-to-artist distances
    def artist_jump_distributions(self,fi,bins,self_jumps=False):
        user = fi.split('/')[-1][:-4]
        df = pd.read_pickle(fi)
        if self_jumps:
            vals = np.histogram(df['dist'].dropna(),bins=bins)[0]
        else:
            vals = np.histogram(df['dist'][df['dist']>0],bins=bins)[0]
        self.rootLogger.info('artist jump distances done for user {} ({})'.format(user,fi))
        return user,vals

    # calculate distribution (using histogram with specified bins)
    # of patch diversity for each user

    # awk 'FNR==1' * > diversity_dists_zeros
    # awk 'FNR==2' * > diversity_dists_nozeros
    def diversity_distributions(self,fi,bins):
        if 'patches' not in fi:
            raise ValueError('WRONG DATATYPE')
        user = fi.split('/')[-1].split('_')[0]
        df = pd.read_pickle(fi).dropna(subset=['diversity'])
        zeros = np.histogram(df[df['n']>=5]['diversity'],bins=bins)[0]
        nozeros = np.histogram(df[(df['n']>=5)&(df['diversity']>0)]['diversity'],bins=bins)[0]

        zeros = zeros/float(zeros.sum())
        nozeros = nozeros/float(nozeros.sum())

        with open(self.args.resultdir+user,'w') as fout:
            fout.write(user+'\t'+'zeros'+'\t'+','.join(zeros.astype(str))+'\n')
            fout.write(user+'\t'+'nozeros'+'\t'+','.join(nozeros.astype(str))+'\n')
        self.rootLogger.info('diversity distributions done for user {} ({})'.format(user,fi))


    def mean_block_distances(self,fi,n=100):

        def cos_nan(arr1,arr2):
            if np.any(np.isnan(arr1)) or np.any(np.isnan(arr2)):
                return np.nan
            else:
                return cosine(arr1,arr2)


        user = fi.split('/')[-1].split('_')[0]
        df = pd.read_pickle(fi)
        blocks = df[df['n']>=5].dropna()

        result = []
        for i in xrange(len(blocks)-n):
            first = blocks['centroid'].iloc[i]
            result.append(np.array(blocks['centroid'][i+1:i+n+1].apply(lambda val: cos_nan(val,first))))
        result = np.nanmean(np.vstack(result),0)

        with open(self.args.resultdir+user,'w') as fout:
            fout.write('\t'.join([user,'patch',','.join(result.astype(str))])+'\n')

        self.rootLogger.info('Block distances for user {} processed successfully ({})'.format(user,fi))


        # now shuffled
        # idx = np.array(blocks.index)
        # np.random.shuffle(idx)
        # blocks = blocks.reindex(idx)

        # result_random = []
        # for i in xrange(len(blocks)-n):
        #     first = blocks['centroid'].iloc[i]
        #     result_random.append(np.array(blocks['centroid'][i+1:i+n+1].apply(lambda val: cos_nan(val,first))))
        # result_random = np.nanmean(np.vstack(result_random),0)

        # with open(self.args.resultdir+user,'w') as fout:
        #     fout.write('\t'.join([user,'patch',','.join(result.astype(str))])+'\n')
        #     fout.write('\t'.join([user,'patch_random',','.join(result_random.astype(str))])+'\n')
        # self.rootLogger.info('Block distances for user {} processed successfully ({})'.format(user,fi))

    def clustering(self,fi):
        df = pd.read_pickle(fi)
        user = fi.split('/')[-1].split('_')[0]

        mask = (df['centroid'].apply(lambda arr: ~np.any(np.isnan(arr))).values)&(df['n']>=5)&(df['diversity']<=0.2)
        clust_data = df[mask].reset_index()
        arr =  np.vstack(clust_data['centroid'])
        Z = linkage(arr, 'complete')
        clusters = fcluster(Z,t=0.2,criterion='distance')
        assignments = np.repeat(np.nan,len(df))
        assignments[np.where(mask)] = clusters
        df['patch_clust'] = assignments
        df.to_pickle('{}{}.pkl'.format(self.args.resultdir,user))
        self.rootLogger.info('Patch clusters for user {} processed successfully ({})'.format(user,fi))

    def patch_len_dists(self,fi):
        df = pd.read_pickle(fi)
        user = fi.split('/')[-1][:-4]

        explore = df[np.isnan(df['patch_clust'])]
        result_explore = explore['n'].value_counts()

        df['explore'] = np.isnan(df['patch_clust']).astype(int)
        df['explore-idx'] = df['explore'].cumsum()

        result_exploit =  df.groupby('explore-idx').apply(lambda df: df.dropna()['n'].sum()).value_counts()

        result_explore = result_explore.reindex(xrange(1,max(result_explore.index)+1),fill_value=0.).values
        result_exploit = result_exploit.reindex(xrange(1,max(result_exploit.index)+1),fill_value=0.).values

        result_explore = sparse.csr_matrix(result_explore)
        result_exploit = sparse.csr_matrix(result_exploit)


        with open(self.args.resultdir+user,'w') as fout:
            fout.write(user+'\t'+'explore'+'\t'+':'.join([','.join(a.astype(str)) for a in (result_explore.data, result_explore.indices, result_explore.indptr)])+'\n')
            fout.write(user+'\t'+'exploit'+'\t'+':'.join([','.join(a.astype(str)) for a in (result_exploit.data, result_exploit.indices, result_exploit.indptr)])+'\n')
        self.rootLogger.info('User {} processed successfully ({})'.format(user,fi))



    def explore_exploit(self,fi):

        user = fi.split('/')[-1][:-4]

        df_patches_raw = pd.read_pickle(fi)

        # add time in next bout
        df_patches_raw['next_n'] = df_patches_raw['n'].shift(-1)

        # add patch values
        # listensPerPatch = df_patches_raw.groupby('patch_clust')['n'].sum()
        # overall_prop = listensPerPatch/float(df_patches_raw['n'].sum())
        # overall_prop.name = 'final_value'
        # df_patches_raw = df_patches_raw.join(overall_prop,on='patch_clust')


        """
        # time in next exploit patch as function of exploration time
        result = df_patches_raw[np.isnan(df_patches_raw['patch_clust'])].groupby('n')['next_n'].mean()

        fout.write(user+'\t'+'next-exploit-vs-explore'+'\t'+','.join(["{}:{}".format(a,b) for a,b in result.iteritems()])+'\n')
        """
        # total time exploiting as a function of time exploring
        df_patches_raw['explore'] = np.isnan(df_patches_raw['patch_clust']).astype(int)
        df_patches_raw['explore-idx'] = df_patches_raw['explore'].cumsum()

        # combine all exploit listens
        #grp_explore = df_patches_raw.groupby('explore-idx').apply(lambda df: pd.DataFrame({'n':[df['n'].iloc[0]],'n-exploit':[df['n'][1:].sum()]}))

        # only last exploit bout
        grp_explore = df_patches_raw.groupby('explore-idx').apply(lambda df: pd.DataFrame({'n':[df['n'].iloc[0]],'n-exploit':[df['n'].iloc[-1]]}))

        #result = grp_explore.groupby('n')['n-exploit'].mean()
        #fout.write(user+'\t'+'total-exploit-vs-explore'+'\t'+','.join(["{}:{}".format(a,b) for a,b in result.iteritems()])+'\n')
        """
        # exploration time as a function of exploitation time
        grp_exploit = grp_explore.copy()
        grp_exploit['n-explore'] = grp_exploit['n'].shift(-1)

        result = grp_exploit.groupby('n-exploit')['n-explore'].mean()
        fout.write(user+'\t'+'explore-vs-exploit'+'\t'+','.join(["{}:{}".format(a,b) for a,b in result.iteritems()])+'\n')
        """

        # prob exploit given explore time - already done

        # explore_only = df_patches_raw[np.isnan(df_patches_raw['patch_clust'])]
        # result = explore_only['n'][:-1].value_counts()
        # arr = result.reindex(xrange(1,max(result.index)+1),fill_value=0.).values
        # final_result = arr/(np.cumsum(arr[::-1])[::-1])
        # final_result = sparse.csr_matrix(final_result)

        # with open(self.args.resultdir+user+'_exploit','w') as fout:
        #     fout.write(user+'\t'+':'.join([','.join(a.astype(str)) for a in final_result.data,final_result.indices,final_result.indptr])+'\n')


        # prob explore given exploit time
        result = grp_explore['n-exploit'][grp_explore['n-exploit']>0].value_counts()
        arr = result.reindex(xrange(1,max(result.index)+1),fill_value=0.).values
        final_result = arr/np.cumsum(arr[::-1])[::-1]
        final_result = sparse.csr_matrix(final_result)

        with open(self.args.resultdir+user+'_explore','w') as fout:
            fout.write(user+'\t'+':'.join([','.join(a.astype(str)) for a in (final_result.data, final_result.indices, final_result.indptr)])+'\n')


        #fout.write(user+'\t'+'prob-explore-given-exploit'+'\t'+','.join(["{}:{}".format(a,b) for a,b in result.iteritems()])+'\n')

        """
        # patch value as a function of exploration time
        df_patches_raw['final_value_next'] = df_patches_raw['final_value'].shift(-1)
        result = df_patches_raw[np.isnan(df_patches_raw['patch_clust'])].groupby('n')['final_value_next'].mean()
        fout.write(user+'\t'+'exploit-value-vs-explore'+'\t'+','.join(["{}:{}".format(a,b) for a,b in result.iteritems()])+'\n')
        """

        self.rootLogger.info('User {} processed successfully ({})'.format(user,fi))
Code example #32

def genseq(idx):

    first = np.where(np.random.multinomial(1,pvals=pops)==1)[0][0]
    last = first
    last_ts = datetime.now()
    result = {'artist_idx':[first],'ts':[last_ts]}
    for i in xrange(seq_length-1):
        next_listen = draw(last)
        last = next_listen
        gap_bin = 120*np.where(np.random.multinomial(1,pvals=td)==1)[0][0]
        gap = np.random.randint(gap_bin,gap_bin+120)
        result['artist_idx'].append(next_listen)
        new_ts = last_ts+timedelta(0,gap)
        result['ts'].append(new_ts)
        last_ts = new_ts

    df = pd.DataFrame(result)
    df['block'] = ((df['artist_idx'].shift(1) != df['artist_idx']).astype(int).cumsum())-1
    df.to_pickle(str(idx)+'.pkl')
    logging.info('idx {} complete'.format(idx))

pool = Pool(cpu_count())
indices = range(n)
pool.map(genseq,indices)
pool.close()



Code example #33
    def integrate_model(self, n_realizations, int_length = None, noise_type = 'white', sigma = 1., n_workers = 3, diagnostics = True):
        """
        Integrate trained model.
        noise_type:
        -- white - classic white noise, spatial correlation by cov. matrix of last level residuals
        -- cond - find n_samples closest to the current space in subset of n_pcs and use their cov. matrix
        -- seasonal - seasonal dependence of the residuals, fit n_harm harmonics of annual cycle, could also be used with cond.
        except 'white', one can choose more settings like ['seasonal', 'cond']
        """

        if self.verbose:
            print("preparing to integrate model...")

        pcs = self.input_pcs.copy()
        pcs = pcs.T # time x dim

        pcmax = np.amax(pcs, axis = 0)
        pcmin = np.amin(pcs, axis = 0)
        self.varpc = np.var(pcs, axis = 0, ddof = 1)
        
        self.int_length = pcs.shape[0] if int_length is None else int_length

        self.diagnostics = diagnostics

        if self.harmonic_pred in ['all', 'first']:
            if self.verbose:
                print("...using harmonic predictors (with annual frequency)...")
            self.xsin = np.sin(2*np.pi*np.arange(self.int_length) / 12.)
            self.xcos = np.cos(2*np.pi*np.arange(self.int_length) / 12.)

        if self.verbose:
            print("...preparing noise forcing...")

        self.sigma = sigma
        if isinstance(noise_type, basestring):
            if noise_type not in ['white', 'cond', 'seasonal']:
                raise Exception("Unknown noise type to be used as forcing. Use 'white', 'cond', or 'seasonal'.")
        elif isinstance(noise_type, list):
            noise_type = frozenset(noise_type)
            if not noise_type.issubset(set(['white', 'cond', 'seasonal'])):
                raise Exception("Unknown noise type to be used as forcing. Use 'white', 'cond', or 'seasonal'.")
        
        self.last_level_res = self.residuals[max(self.residuals.keys())]
        self.noise_type = noise_type
        if noise_type == 'white':
            if self.verbose:
                print("...using spatially correlated white noise...")
            Q = np.cov(self.last_level_res, rowvar = 0)
            self.rr = np.linalg.cholesky(Q).T

        if 'seasonal' in noise_type:
            n_harmonics = 5
            if self.verbose:
                print("...fitting %d harmonics to estimate seasonal modulation of last level's residual..." % n_harmonics)
            if self.delay_model:
                resid_delayed = self.last_level_res[-(self.last_level_res.shape[0]//12)*12:].copy()
                rr_last = np.reshape(resid_delayed, (12, self.last_level_res.shape[0]//12, self.last_level_res.shape[1]), order = 'F')
            else:
                rr_last = np.reshape(self.last_level_res, (12, self.last_level_res.shape[0]//12, self.last_level_res.shape[1]), order = 'F')
            rr_last_std = np.nanstd(rr_last, axis = 1, ddof = 1)
            predictors = np.zeros((12, 2*n_harmonics + 1))
            for nh in range(n_harmonics):
                predictors[:, 2*nh] = np.cos(2*np.pi*(nh+1)*np.arange(12) / 12)
                predictors[:, 2*nh+1] = np.sin(2*np.pi*(nh+1)*np.arange(12) / 12)
            predictors[:, -1] = np.ones((12,))
            bamp = np.zeros((predictors.shape[1], pcs.shape[1]))
            for k in range(bamp.shape[1]):
                bamp[:, k] = np.linalg.lstsq(predictors, rr_last_std[:, k])[0]
            rr_last_std_ts = np.dot(predictors, bamp)
            self.rr_last_std_ts = np.repeat(rr_last_std_ts, repeats = self.last_level_res.shape[0]//12, axis = 0)
            if self.delay_model:
                resid_delayed /= self.rr_last_std_ts
                Q = np.cov(resid_delayed, rowvar = 0)
            else:
                self.last_level_res /= self.rr_last_std_ts
                Q = np.cov(self.last_level_res, rowvar = 0)

            self.rr = np.linalg.cholesky(Q).T


        if diagnostics:
            if self.verbose:
                print("...running diagnostics for the data...")
            # ACF, kernel density, integral corr. timescale for data
            self.max_lag = 50
            lag_cors = np.zeros((2*self.max_lag + 1, pcs.shape[1]))
            kernel_densities = np.zeros((100, pcs.shape[1], 2))
            for k in range(pcs.shape[1]):
                lag_cors[:, k] = cross_correlation(pcs[:, k], pcs[:, k], max_lag = self.max_lag)
                kernel_densities[:, k, 0], kernel_densities[:, k, 1] = kdensity_estimate(pcs[:, k], kernel = 'epanechnikov')
            integral_corr_timescale = np.sum(np.abs(lag_cors), axis = 0)

            # init for integrations
            lag_cors_int = np.zeros([n_realizations] + list(lag_cors.shape))
            kernel_densities_int = np.zeros([n_realizations] + list(kernel_densities.shape))
            stat_moments_int = np.zeros((4, n_realizations, pcs.shape[1])) # mean, variance, skewness, kurtosis
            int_corr_scale_int = np.zeros((n_realizations, pcs.shape[1]))

        self.diagpc = np.diag(np.std(pcs, axis = 0, ddof = 1))
        self.maxpc = np.amax(np.abs(pcs))
        self.diagres = {}
        self.maxres = {}
        for l in self.residuals.keys():
            self.diagres[l] = np.diag(np.std(self.residuals[l], axis = 0, ddof = 1))
            self.maxres[l] = np.amax(np.abs(self.residuals[l]))

        self.pcs = pcs

        if n_workers > 1:
            # from multiprocessing import Pool
            from pathos.multiprocessing import ProcessingPool
            pool = ProcessingPool(n_workers)
            map_func = pool.amap
            if self.verbose:
                print("...running integration of %d realizations using %d workers..." % (n_realizations, n_workers))
        else:
            map_func = map
            if self.verbose:
                print("...running integration of %d realizations single threaded..." % n_realizations)

        rnds = []
        for n in range(n_realizations):
            r = {}
            for l in self.fit_mat.keys():
                if l == 0:
                    if self.delay_model:
                        r[l] = np.dot(self.diagpc, np.random.normal(0, sigma, (pcs.shape[1], self.delay)))
                    else:
                        r[l] = np.dot(np.random.normal(0, sigma, (pcs.shape[1],)), self.diagpc)
                else:
                    if self.delay_model:
                        r[l] = np.dot(self.diagres[l-1], np.random.normal(0, sigma, (pcs.shape[1], self.delay)))
                    else:
                        r[l] = np.dot(np.random.normal(0, sigma, (pcs.shape[1],)), self.diagres[l-1])
            rnds.append(r)
        args = [[i, rnd, noise_type] for i, rnd in zip(range(n_realizations), rnds)]
        results = map_func(self._process_integration, args)

        del args
        if n_workers > 1:
            pool.close()

        self.integration_results = np.zeros((n_realizations, pcs.shape[1], self.int_length))
        self.num_exploding = np.zeros((n_realizations,))

        if n_workers > 1:
            results = results.get()

        if self.diagnostics:
            # x, num_exploding, xm, xv, xs, xk, lc, kden, ict
            for i, x, num_expl, xm, xv, xs, xk, lc, kden, ict in results:
                self.integration_results[i, ...] = x.T
                self.num_exploding[i] = num_expl
                stat_moments_int[0, i, :] = xm
                stat_moments_int[1, i, :] = xv
                stat_moments_int[2, i, :] = xs
                stat_moments_int[3, i, :] = xk
                lag_cors_int[i, ...] = lc
                kernel_densities_int[i, ...] = kden
                int_corr_scale_int[i, ...] = ict
        else:
            for i, x, num_expl in results:
                self.integration_results[i, ...] = x.T
                self.num_exploding[i] = num_expl

        if self.verbose:
            print("...integration done, now saving results...")

        if self.verbose:
            print("...results saved to structure.")
            print("there was %d expolding integration chunks in %d realizations." % (np.sum(self.num_exploding), n_realizations))
        
        if self.diagnostics:
            if self.verbose:
                print("plotting diagnostics...")
            
            import matplotlib.pyplot as plt
            # plot all diagnostic stuff
            ## mean, variance, skewness, kurtosis, integral corr. time scale
            titles = ['MEAN', 'VARIANCE', 'SKEWNESS', 'KURTOSIS', 'INTEGRAL CORRELATION TIME SCALE']
            plot = [np.mean(pcs, axis = 0), np.var(pcs, axis = 0, ddof = 1), sts.skew(pcs, axis = 0), sts.kurtosis(pcs, axis = 0), integral_corr_timescale]
            xplot = np.arange(1, pcs.shape[1]+1)
            for i, tit, p in zip(range(5), titles, plot):
                plt.figure()
                plt.title(tit, size = 20)
                plt.plot(xplot, p, linewidth = 3, color = '#3E3436')
                if i < 4:
                    plt.plot(xplot, np.percentile(stat_moments_int[i, :, :], q = 2.5, axis = 0), '--', linewidth = 2.5, color = '#EA3E36')
                    plt.plot(xplot, np.percentile(stat_moments_int[i, :, :], q = 97.5, axis = 0), '--', linewidth = 2.5, color = '#EA3E36')
                else:
                    plt.plot(xplot, np.percentile(int_corr_scale_int, q = 2.5, axis = 0), '--', linewidth = 2.5, color = '#EA3E36')
                    plt.plot(xplot, np.percentile(int_corr_scale_int, q = 97.5, axis = 0), '--', linewidth = 2.5, color = '#EA3E36')
                plt.xlabel("# PC", size = 15)
                plt.xlim([xplot[0], xplot[-1]])
                plt.show()
                plt.close()

            ## lagged correlations, PDF - plot first 9 PCs (or less if input number of pcs is < 9)
            titles = ['AUTOCORRELATION', 'PDF']
            plot = [[lag_cors, lag_cors_int], [kernel_densities, kernel_densities_int]]
            xlabs = ['LAG', '']
            for i, tit, p, xlab in zip(range(2), titles, plot, xlabs):
                plt.figure()
                plt.suptitle(tit, size = 25)
                no_plts = 9 if self.no_input_ts > 9 else self.no_input_ts
                for sub in range(0,no_plts):
                    plt.subplot(3, 3, sub+1)
                    if i == 0:
                        xplt = np.arange(0, self.max_lag+1)
                        plt.plot(xplt, p[0][p[0].shape[0]//2:, sub], linewidth = 3, color = '#3E3436')
                        plt.plot(xplt, np.percentile(p[1][:, p[0].shape[0]//2:, sub], q = 2.5, axis = 0), '--', linewidth = 2.5, color = '#EA3E36')
                        plt.plot(xplt, np.percentile(p[1][:, p[0].shape[0]//2:, sub], q = 97.5, axis = 0), '--', linewidth = 2.5, color = '#EA3E36')
                        plt.xlim([xplt[0], xplt[-1]])
                    else:
                        plt.plot(p[0][:, sub, 0], p[0][:, sub, 1], linewidth = 3, color = '#3E3436')
                        plt.plot(p[1][0, :, sub, 0], np.percentile(p[1][:, :, sub, 1], q = 2.5, axis = 0), '--', linewidth = 2.5, color = '#EA3E36')
                        plt.plot(p[1][0, :, sub, 0], np.percentile(p[1][:, :, sub, 1], q = 97.5, axis = 0), '--', linewidth = 2.5, color = '#EA3E36')
                        plt.xlim([p[0][0, sub, 0], p[0][-1, sub, 0]])
                    plt.xlabel(xlab, size = 15)
                    plt.title("PC %d" % (int(sub)+1), size = 20)
                # plt.tight_layout()
                plt.show()
                plt.close()
Code example #34
File: setup.py Project: jlorince/MusicForaging
class setup(object):

    # init just takes in command line arguments and sets up logging
    def __init__(self, args, logging_level=logging.INFO):

        self.args = args

        # logger setup
        now = datetime.datetime.now()
        log_filename = now.strftime("setup_%Y%m%d_%H%M%S.log")
        logFormatter = logging.Formatter("%(asctime)s\t[%(levelname)s]\t%(message)s")
        self.rootLogger = logging.getLogger()
        # fileHandler = logging.FileHandler(log_filename)
        # fileHandler.setFormatter(logFormatter)
        # self.rootLogger.addHandler(fileHandler)
        consoleHandler = logging.StreamHandler()
        consoleHandler.setFormatter(logFormatter)
        self.rootLogger.addHandler(consoleHandler)
        self.rootLogger.setLevel(logging_level)

        # self.rootLogger.info("Input arguments: "+str(args))

        if self.args.feature_path:
            features = np.load(self.args.feature_path)
            self.n_features = features.shape[1]
            self.features = {i: features[i] for i in range(len(features))}

    @staticmethod
    def userFromFile(fi):
        return fi.split("/")[-1].split("_")[-1][:-4]

    # set up processing pool and run all analyses specified in args
    def run(self):

        if self.args.preprocess:
            # self.rootLogger.info("Starting preprocessing")
            self.preprocess()
            # self.rootLogger.info("Preprocessing complete")

        if self.args.patch_basis is not None:
            # self.rootLogger.info("Starting patch summaries")
            self.summarize_patches()
            # self.rootLogger.info("Patch summaries complete")

        if self.args.blockdists:
            # self.rootLogger.info("Starting block distance analysis")
            self.mean_block_distances(self.args.file)

        if self.args.blockgaps:
            # self.rootLogger.info("Starting block distance analysis")
            self.blockgaps(self.args.file)

        if self.args.scrobblegaps:
            # self.rootLogger.info("Starting block distance analysis")
            self.scrobble_gaps(self.args.file)

        if self.args.ee_artists:
            self.ee_artists(self.args.file)

        if self.args.ee_artists_2:
            self.ee_artists_2(self.args.file)

        if self.args.ee_artists_dists:
            self.ee_artists_dists(self.args.file)

        if self.args.block_len_dists:
            self.block_len_dists(self.args.file)

    # Calls preprocessing code to load raw text files and convert to dataframes, adding features, disances, etc.
    def preprocess(self):

        self.artist_idx_feature_map = {}
        for line in open(self.args.suppdir + "artist_idx_feature_map"):
            k, v = line.strip().split("\t")
            self.artist_idx_feature_map[float(k)] = int(v)

        if self.args.file:
            result = self.processor(
                fi=self.args.file,
                output_dir=self.args.pickledir,
                is_sorted=True,
                features=self.features,
                dist=self.args.distance_metric,
                session_threshold=self.args.session_thresh,
                dist_threshold=self.args.dist_thresh,
                min_patch_length=self.args.min_patch_length,
                artist_idx_feature_map=self.artist_idx_feature_map,
            )

            # if self.args.patch_len_dist:
            #     user,vals_simple,vals_shuffle = result
            #     with open(self.args.resultdir+user,'a') as fout:
            #         if vals_simple is not None:
            #             fout.write('\t'.join([user,'simple',str(self.args.dist_thresh)])+'\t'+','.join(vals_simple.astype(str))+'\n')
            #         fout.write('\t'.join([user,'shuffle',str(self.args.dist_thresh),str(self.args.min_patch_length)])+'\t'+','.join(vals_shuffle.astype(str))+'\n')

        else:
            if self.args.rawtext:
                if self.args.skip_complete:
                    done = set(
                        [
                            self.userFromFile(fi)
                            for fi in glob(self.args.pickledir + "*.pkl")
                            if "_patches_" not in fi and fi.startswith(self.args.prefix_output)
                        ]
                    )
                else:
                    done = set()
                files = [fi for fi in glob(self.args.datadir + "*.txt") if self.userFromFile(fi) not in done]
            else:
                if self.args.skip_complete:
                    done = set(
                        [
                            self.userFromFile(fi)
                            for fi in glob(self.args.pickledir + "*.pkl")
                            if "_patches_" not in fi and fi.startswith(self.args.prefix_output)
                        ]
                    )
                else:
                    done = set()
                files = [
                    fi
                    for fi in glob(self.args.pickledir + "*.pkl")
                    if "_patches_" not in fi
                    and fi.startswith(self.args.prefix_input)
                    and self.userFromFile(fi) not in done
                ]

            self.n_files = len(files)

            self.rootLogger.debug(files)

            func_partial = partial(
                self.processor,
                output_dir=self.args.pickledir,
                is_sorted=True,
                features=self.features,
                dist=self.args.distance_metric,
                session_threshold=self.args.session_thresh,
                dist_threshold=self.args.dist_thresh,
                min_patch_length=self.args.min_patch_length,
                artist_idx_feature_map=self.artist_idx_feature_map,
            )

            self.pool = Pool(self.args.n)
            self.rootLogger.info("Pool started")
            self.pool.map(func_partial, files)
            self.pool.close()
            self.rootLogger.info("Pool closed")

    # Jensen Shannon Distance (Sqrt of Jensen Shannon Divergence)
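    # Example (hypothetical inputs): JSD(np.array([0.5, 0.5]), np.array([0.9, 0.1])) ≈ 0.32;
    # identical distributions give 0.0 and disjoint ones give sqrt(ln 2) ≈ 0.83, since
    # scipy's entropy() uses the natural logarithm.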
    @staticmethod
    def JSD(P, Q):
        if np.all(np.isnan(P)) or np.all(np.isnan(Q)):
            return np.nan
        _P = P / norm(P, ord=1)
        _Q = Q / norm(Q, ord=1)
        _M = 0.5 * (_P + _Q)
        return np.sqrt(np.clip(0.5 * (entropy(_P, _M) + entropy(_Q, _M)), 0, 1))

    # Calculate distance between any two feature arrays
    def calc_dist(self, idx_1, idx_2, metric="cosine"):
        features1 = self.get_features(idx_1)
        features2 = self.get_features(idx_2)
        if np.any(np.isnan(features1)) or np.any(np.isnan(features2)):
            return np.nan
        if np.all(features1 == features2):
            return 0.0
        if metric == "JSD":
            return self.JSD(features1, features2)
        elif metric == "cosine":
            return cosine(features1, features2)
        elif metric == "euclidean":
            return euclidean(features1, features2)

    # "s -> (s0,s1), (s1,s2), (s2, s3), ..."
    @staticmethod
    def pairwise(iterable):
        a, b = tee(iterable)
        next(b, None)
        return zip(a, b)

    # segment patch, generating both simple and shuffle-based indices
    def patch_segmenter(self, df, metric, min_length, dist_thresh):
        l = df["artist_idx"]
        indices = list(np.array([len(list(v)) for g, v in groupby(l)][:-1]).cumsum())
        new_indices = []

        for b in indices:
            dist = self.calc_dist(df.iloc[b]["artist_idx"], df.iloc[b - 1]["artist_idx"], metric=metric)

            if (np.isnan(dist)) or (dist >= dist_thresh):
                new_indices.append(b)

        if new_indices:

            last_patch = False
            final_indices = []
            for i, (a, b) in enumerate(self.pairwise([0] + new_indices + [len(df)])):
                if b - a >= min_length:
                    if a > 0:
                        final_indices.append(a)
                    last_patch = True
                else:
                    if last_patch:
                        final_indices.append(a)
                    last_patch = False

            return final_indices, new_indices

        return new_indices, new_indices

    # retrieve features from feature matrix, given an artist idx. Return array of np.nans if artist idx is null
    def get_features(self, idx):
        return self.features.get(idx, np.repeat(np.nan, self.n_features))
        # if np.isnan(idx):
        #     return np.repeat(np.nan,self.features.shape[1])
        # else:
        #     return self.features[int(idx)]

    # Core preprocessing code. Can take in raw text files, or pickle files (in which case feature/dist values are updated appropriately)
    def processor(
        self,
        fi,
        output_dir,
        is_sorted=True,
        features=None,
        dist="cosine",
        session_threshold=None,
        dist_threshold=0.2,
        min_patch_length=5,
        artist_idx_feature_map=None,
    ):

        # get user_id from filename
        user = self.userFromFile(fi)
        self.rootLogger.debug("processor called (user {})".format(user))

        if fi.endswith(".txt"):

            if output_dir is None:
                raise ("output path must be specified!")
            if artist_idx_feature_map is None:
                raise ("artist_idx_feature_map_path must be provided!")

            df = pd.read_table(fi, header=None, names=["artist_id", "ts"], parse_dates=["ts"])
            if not is_sorted:
                df = df.sort_values(by="ts")

            df["td"] = df["ts"] - df.shift(1)["ts"]
            df["td"] = df["td"].astype(int) / 10 ** 9
            df["artist_idx"] = df["artist_id"].apply(lambda x: artist_idx_feature_map.get(x))
            n = float(len(df))
            n_null = df["artist_idx"].isnull().sum()
            notnull = n - n_null
            propnull = n_null / n
            if notnull < 1000 or (propnull >= 0.05):
                self.rootLogger.info(
                    "User {} SKIPPED ({} non null, {:.1f}% null) ({})".format(user, notnull, 100 * propnull, fi)
                )
                return None
            self.rootLogger.debug("DF loaded (user {})".format(user))

        elif fi.endswith(".pkl"):
            df = pd.read_pickle(fi)

        # get features and calculate distances
        if features is not None:
            # df['features'] = df['artist_idx'].apply(lambda idx: self.get_features(idx))
            # df['features_shift'] = df['features'].shift(1)
            df["prev"] = df["artist_idx"].shift(1)

            df["dist"] = df.apply(lambda row: self.calc_dist(row["artist_idx"], row["prev"], metric=dist), axis=1)

            self.rootLogger.debug("features and dists done (user {})".format(user))

        if session_threshold == 0:
            df["session"] = 0

        elif (session_threshold is not None) and (session_threshold > 0):
            if "td" not in df.columns:
                df["td"] = df["ts"] - df.shift(1)["ts"]
                df["td"] = df["td"].astype(int) / 10 ** 9
            session_idx = 0
            session_indices = []
            for val in df["td"] >= session_threshold:
                if val:
                    session_idx += 1
                session_indices.append(session_idx)
            df["session"] = session_indices
            self.rootLogger.debug("session indices done (user {})".format(user))

        if (min_patch_length is not None) and (dist_threshold is not None):

            self.rootLogger.debug("starting patch segmentation for user {})".format(user))

            indices_shuffle = np.zeros(len(df), dtype=int)
            indices_simple = np.zeros(len(df), dtype=int)
            offset_shuffle = 0
            idx_shuffle = 0
            offset_simple = 0
            idx_simple = 0

            ### NEED TO REWORK THIS BIT TO LOSE SOME REDUNDANCY

            for session in df.groupby("session"):
                result_shuffle, result_simple = self.patch_segmenter(
                    session[1], metric=dist, min_length=min_patch_length, dist_thresh=dist_threshold
                )
                # if session[0]==0:
                #    print result_shuffle,result_simple
                #    sys.exit()
                n = len(session[1])

                if len(result_shuffle) == 0:
                    indices_shuffle[offset_shuffle : offset_shuffle + n] = idx_shuffle
                    idx_shuffle += 1
                else:
                    indices_shuffle[offset_shuffle : offset_shuffle + result_shuffle[0]] = idx_shuffle
                    idx_shuffle += 1
                    for v, w in self.pairwise(result_shuffle):
                        indices_shuffle[offset_shuffle + v : offset_shuffle + w] = idx_shuffle
                        idx_shuffle += 1
                    indices_shuffle[
                        offset_shuffle + result_shuffle[-1] : offset_shuffle + result_shuffle[-1] + n
                    ] = idx_shuffle
                    idx_shuffle += 1
                offset_shuffle += n

                if len(result_simple) == 0:
                    indices_simple[offset_simple : offset_simple + n] = idx_simple
                    idx_simple += 1
                else:
                    indices_simple[offset_simple : offset_simple + result_simple[0]] = idx_simple
                    idx_simple += 1
                    for v, w in self.pairwise(result_simple):
                        indices_simple[offset_simple + v : offset_simple + w] = idx_simple
                        idx_simple += 1
                    indices_simple[
                        offset_simple + result_simple[-1] : offset_simple + result_simple[-1] + n
                    ] = idx_simple
                    idx_simple += 1
                offset_simple += n

            if result_shuffle:
                indices_shuffle[offset_shuffle + result_shuffle[-1] :] = idx_shuffle
            else:
                indices_shuffle[offset_shuffle:] = idx_shuffle

            if result_simple:
                indices_simple[offset_simple + result_simple[-1] :] = idx_simple
            else:
                indices_simple[offset_simple:] = idx_simple

            df["patch_idx_shuffle"] = indices_shuffle
            df["patch_idx_simple"] = indices_simple

            self.rootLogger.debug("patch indices done (user {})".format(user))

            # add artist block info
            ### https://stackoverflow.com/questions/14358567/finding-consecutive-segments-in-a-pandas-data-frame
            ### -1 for zero-based indexing
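            ### e.g. artist_idx [7, 7, 3, 3, 3, 7] -> block [0, 0, 1, 1, 1, 2]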

            df["block"] = ((df["artist_idx"].shift(1) != df["artist_idx"]).astype(int).cumsum()) - 1

            self.rootLogger.debug("artist blocks done (user {})".format(user))

        cols = ["ts", "artist_idx", "dist", "session", "patch_idx_shuffle", "patch_idx_simple", "block"]

        df = df[list(set(df.columns).intersection(cols))]
        if self.args.save:
            df.to_pickle("{}{}.pkl".format(output_dir, user))

        if self.args.patch_len_dist:
            self.patch_length_distributions(user, df, bins=np.arange(0, 1001, 1), method=self.args.patch_len_dist)
            # return user,vals_simple,vals_shuffle

        self.rootLogger.info("User {} processed successfully ({})".format(user, fi))
        return None

    # calculate patch summary measures (mean feature array, diversity, etc.). Applied to each patch
    def patch_measures(self, df, agg_stats=True, metric="cosine"):
        first = df.iloc[0]
        n = len(df)
        start = first["ts"]
        if agg_stats:
            # artists = df['artist_idx'].values
            if (n == 1) or (len(df["artist_idx"].unique()) == 1):
                diversity = 0.0
                centroid = first["features"]
            else:
                features = np.array([f for f in df["features"]])
                # I expect to see RuntimeWarnings in this block
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore", category=RuntimeWarning)
                    centroid = np.nanmean(features, axis=0)
                    diversity = np.nanmean(pdist(features, metric=metric))

            # return pd.Series({'diversity':diversity,'centroid':centroid,'start_ts':start,'n':n,'artists':artists})
            return pd.Series({"diversity": diversity, "centroid": centroid, "start_ts": start, "n": n})

    # generate patch summary for each user, and save resulting pickle
    def patch_summary(self, fi, basis, metric):
        user = self.userFromFile(fi)
        df = pd.read_pickle(fi)
        df["features"] = df["artist_idx"].apply(lambda idx: self.get_features(idx))
        if basis == "block":
            agg_stats = False
        elif basis in ("patch_idx_shuffle", "patch_idx_simple"):
            agg_stats = True
        else:
            raise ("Invalid patch basis")
        result = df.groupby(basis).apply(self.patch_measures, agg_stats, metric)
        # result['start_idx'] = result['n'].cumsum().shift(1).fillna(0).astype(int)
        result.reset_index(drop=True).to_pickle("{}{}_patches_{}.pkl".format(self.args.resultdir, user, basis))
        self.rootLogger.info("Patches processed for user {} successfully ({})".format(user, fi))

    # run patch summaries for all users
    def summarize_patches(self):

        if self.args.file:
            self.patch_summary(fi=self.args.file, basis=self.args.patch_basis, metric=self.args.distance_metric)

        else:

            if self.args.skip_complete:
                done = set(
                    [
                        self.userFromFile(fi)
                        for fi in glob(self.args.pickledir + "*.pkl")
                        if "_patches_" in fi and fi.startswith(self.args.prefix_output)
                    ]
                )
            else:
                done = set()
            files = [
                fi
                for fi in glob(self.args.pickledir + "*.pkl")
                if "_patches_" not in fi and fi.startswith(self.args.prefix_input) and self.userFromFile(fi) not in done
            ]
            func_partial = partial(self.patch_summary, basis=self.args.patch_basis, metric=self.args.distance_metric)
            self.rootLogger.info("Pool started")
            self.pool.map(func_partial, files)
            self.pool.close()
            self.rootLogger.info("Pool closed")

    def patch_length_distributions(self, user, df, bins, method):
        n_listens = float(len(df))
        if self.args.min_patch_length == 2:
            vc_simple = df["patch_idx_simple"].value_counts().values
            counts_simple = np.clip(vc_simple, 0, 1000)
            vals_simple = np.histogram(counts_simple, bins=bins)[0]
            listens_simple = np.array([i * c for i, c in enumerate(vals_simple)])
            listens_simple[-1] = vc_simple[vc_simple >= 1000].sum()
            listens_simple = listens_simple / n_listens
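            # listens_* re-weights the patch-length histogram by listens: entry i is the fraction
            # of all listens falling in patches of length i, with lengths >= 1000 pooled in the last bin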

            vc_block = df["block"].value_counts().values
            counts_block = np.clip(vc_block, 0, 1000)
            vals_block = np.histogram(counts_block, bins=bins)[0]
            listens_block = np.array([i * c for i, c in enumerate(vals_block)])
            listens_block[-1] = vc_block[vc_block >= 1000].sum()
            listens_block = listens_block / n_listens

            with open(self.args.resultdir + user, "a") as fout:
                fout.write(
                    "\t".join([user, "block", "patches", str(self.args.dist_thresh), str(self.args.min_patch_length)])
                    + "\t"
                    + ",".join(vals_block.astype(str))
                    + "\n"
                )
                fout.write(
                    "\t".join([user, "block", "listens", str(self.args.dist_thresh), str(self.args.min_patch_length)])
                    + "\t"
                    + ",".join(listens_block.astype(str))
                    + "\n"
                )
                fout.write(
                    "\t".join([user, "simple", "patches", str(self.args.dist_thresh), str(self.args.min_patch_length)])
                    + "\t"
                    + ",".join(vals_simple.astype(str))
                    + "\n"
                )
                fout.write(
                    "\t".join([user, "simple", "listens", str(self.args.dist_thresh), str(self.args.min_patch_length)])
                    + "\t"
                    + ",".join(listens_simple.astype(str))
                    + "\n"
                )

        vc_shuffle = df["patch_idx_shuffle"].value_counts().values
        counts_shuffle = np.clip(vc_shuffle, 0, 1000)
        vals_shuffle = np.histogram(counts_shuffle, bins=bins)[0]
        listens_shuffle = np.array([i * c for i, c in enumerate(vals_shuffle)])
        listens_shuffle[-1] = vc_shuffle[vc_shuffle >= 1000].sum()
        listens_shuffle = listens_shuffle / n_listens

        with open(self.args.resultdir + user, "a") as fout:
            fout.write(
                "\t".join([user, "shuffle", "patches", str(self.args.dist_thresh), str(self.args.min_patch_length)])
                + "\t"
                + ",".join(vals_shuffle.astype(str))
                + "\n"
            )
            fout.write(
                "\t".join([user, "shuffle", "listens", str(self.args.dist_thresh), str(self.args.min_patch_length)])
                + "\t"
                + ",".join(listens_shuffle.astype(str))
                + "\n"
            )

    def mean_block_distances(self, fi, n=100, shuffle=False):
        def hash_handler(a, frst):
            if frst > a:
                frst, a = a, frst
            if frst not in dhash:
                dhash[frst] = {}
                result = self.calc_dist(frst, a)
                dhash[frst][a] = result
            else:
                result = dhash[frst].get(a)
                if result is None:
                    result = self.calc_dist(frst, a)
                    dhash[frst][a] = result
            return result
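        # hash_handler memoises pairwise artist distances in dhash, keyed by the smaller
        # artist index first (the distance is symmetric), so calc_dist runs once per pair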

        def cos_nan(arr1, arr2):
            if np.any(np.isnan(arr1)) or np.any(np.isnan(arr2)):
                return np.nan
            else:
                return cosine(arr1, arr2)

        user = fi.split("/")[-1][:-4]
        df = pd.read_pickle(fi)

        if os.path.exists(self.args.resultdir + user):
            levels = {"scrobble": False, "block": False, "D": True, "W": False, "M": False}
        else:
            levels = {"scrobble": True, "block": True, "D": True, "W": True, "M": True}

        if levels["scrobble"]:
            result = []
            dhash = {}
            if shuffle:
                blocks = df.copy()
                idx = np.array(blocks.index)
                np.random.shuffle(idx)
                blocks = blocks.reindex(idx)
            else:
                blocks = df.copy()
            for i in range(len(blocks) - n):
                first = blocks["artist_idx"].iloc[i]
                result.append(np.array(blocks["artist_idx"][i + 1 : i + n + 1].apply(lambda val: hash_handler(val, first))))
            result = np.nanmean(np.vstack(result), 0)
            with open(self.args.resultdir + user, "a") as fout:
                fout.write("\t".join([user, "scrobble", ",".join(result.astype(str))]) + "\n")

        if levels["block"]:
            result = []
            blocks = df[["artist_idx", "block"]].groupby("block").first()
            if shuffle:
                idx = np.array(blocks.index)
                np.random.shuffle(idx)
                blocks = blocks.reindex(idx)
            for i in range(len(blocks) - n):
                first = blocks["artist_idx"].iloc[i]
                result.append(
                    np.array(blocks["artist_idx"][i + 1 : i + n + 1].apply(lambda val: hash_handler(val, first)))
                )
            result = np.nanmean(np.vstack(result), 0)
            with open(self.args.resultdir + user, "a") as fout:
                fout.write("\t".join([user, "block", ",".join(result.astype(str))]) + "\n")

        df["features"] = df["artist_idx"].apply(lambda idx: self.get_features(idx))
        df = df.set_index("ts")["features"]

        for res, n in (("D", 365), ("W", 52), ("M", 12)):
            if levels[res]:
                result = []
                blocks = df.resample(res).aggregate(
                    lambda ser: np.nanmean(np.vstack(ser.values), axis=0)
                    if len(ser) > 0
                    else np.repeat(np.nan, self.n_features)
                )
                if shuffle:
                    idx = np.array(blocks.index)
                    np.random.shuffle(idx)
                    blocks = blocks.reindex(idx)
                for i in range(len(blocks) - n):
                    first = blocks.iloc[i]
                    result.append(np.array(blocks[i + 1 : i + n + 1].apply(lambda val: cos_nan(val, first))))
                result = np.nanmean(np.vstack(result), 0)
                with open(self.args.resultdir + user, "a") as fout:
                    fout.write("\t".join([user, res, ",".join(result.astype(str))]) + "\n")

        self.rootLogger.info("Block distances for user {} processed successfully ({})".format(user, fi))

    def blockgaps(self, fi):
        user = self.userFromFile(fi)
        result = []
        df = pd.read_pickle(fi)[["ts", "artist_idx", "block"]].groupby("block").first()
        bins = np.arange(0, 31, 1)
        day = np.timedelta64(1, "D")
        for artist in df["artist_idx"].dropna().unique():
            current = df[df["artist_idx"] == artist]["ts"]
            td = ((current - current.shift(1)).dropna()) / day
            vals = np.histogram(td, bins=bins)[0]
            result.append(vals / float(vals.sum()))
        result = np.nanmean(np.vstack(result), 0)
        with open(self.args.resultdir + user, "w") as fout:
            fout.write("\t".join([user, ",".join(result.astype(str))]) + "\n")
        self.rootLogger.info("Gap times for user {} processed successfully ({})".format(user, fi))

    def scrobble_gaps(self, fi):
        user = self.userFromFile(fi)
        result = []
        df = pd.read_pickle(fi)["ts"]
        bins = np.arange(0, 60 * 60 * 24 * 30, 120)
        td = (df - df.shift(1)).dropna().apply(lambda x: x.total_seconds())
        vals = np.histogram(td, bins=bins)[0]
        result = vals / float(vals.sum())
        with open(self.args.resultdir + user, "w") as fout:
            fout.write("\t".join([user, ",".join(result.astype(str))]) + "\n")
        self.rootLogger.info("Gap times for user {} processed successfully ({})".format(user, fi))

    def ee_artists(self, fi):
        user = self.userFromFile(fi)
        blocks = pd.read_pickle(fi)["block"]
        result = blocks.value_counts().value_counts()
        arr = result.reindex(range(1, max(result.index) + 1), fill_value=0.0).values
        final_result = arr / (np.cumsum(arr[::-1])[::-1])
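        # arr[L-1] counts blocks of length exactly L; dividing by the reversed cumulative sum gives,
        # for each L, the fraction of blocks of length >= L that end at exactly L (a discrete hazard rate)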
        final_result = sparse.csr_matrix(final_result)

        with open(self.args.resultdir + user, "w") as fout:
            fout.write(
                user
                + "\t"
                + ":".join(
                    [",".join(a.astype(str)) for a in final_result.data, final_result.indices, final_result.indptr]
                )
                + "\n"
            )
        self.rootLogger.info("User {} processed successfully ({})".format(user, fi))

    def ee_artists_2(self, fi):
        user = self.userFromFile(fi)
        blocks = pd.read_pickle(fi)["block"]
        cnts = pd.DataFrame({"n": blocks.value_counts().sort_index()})
        cnts["last-n"] = cnts["n"].shift(1)
        cnts["switch"] = cnts.apply(
            lambda row: 1
            if ((row["last-n"] == 1) and (row["n"] > 1)) or ((row["last-n"] > 1) and (row["n"] == 1))
            else 0,
            axis=1,
        )
        cnts["exp-idx"] = cnts["switch"].cumsum()
        result = cnts.groupby("exp-idx").apply(
            lambda grp: pd.Series({"n": len(grp), "exploit": 0})
            if grp["n"].iloc[0] == 1
            else pd.Series({"n": grp["n"].sum(), "exploit": 1})
        )[:-1]
        # result = cnts.groupby('exp-idx').apply(lambda grp: pd.Series({'n':len(grp),'exploit':0}) if grp['n'].iloc[0]==1 else pd.Series({'n':grp['n'].iloc[-1],'exploit':1}))[:-1]
        arr_exploit = result[result["exploit"] == 1]["n"].value_counts()
        arr_exploit = arr_exploit.reindex(range(1, max(arr_exploit.index) + 1), fill_value=0.0).values
        arr_explore = result[result["exploit"] == 0]["n"].value_counts()
        arr_explore = arr_explore.reindex(range(1, max(arr_explore.index) + 1), fill_value=0.0).values

        final_result_exploit = arr_exploit / (np.cumsum(arr_exploit[::-1])[::-1])
        final_result_exploit = sparse.csr_matrix(final_result_exploit)

        final_result_explore = arr_explore / (np.cumsum(arr_explore[::-1])[::-1])
        final_result_explore = sparse.csr_matrix(final_result_explore)

        with open(self.args.resultdir + user, "w") as fout:
            fout.write(
                user
                + "\t"
                + "explore"
                + "\t"
                + ":".join(
                    [
                        ",".join(a.astype(str))
                        for a in (final_result_explore.data, final_result_explore.indices, final_result_explore.indptr)
                    ]
                )
                + "\n"
            )
            fout.write(
                user
                + "\t"
                + "exploit"
                + "\t"
                + ":".join(
                    [
                        ",".join(a.astype(str))
                        for a in (final_result_exploit.data, final_result_exploit.indices, final_result_exploit.indptr)
                    ]
                )
                + "\n"
            )
        self.rootLogger.info("User {} processed successfully ({})".format(user, fi))

    def ee_artists_dists(self, fi):
        user = self.userFromFile(fi)
        blocks = pd.read_pickle(fi)["block"]
        cnts = pd.DataFrame({"n": blocks.value_counts().sort_index()})
        cnts["last-n"] = cnts["n"].shift(1)
        cnts["switch"] = cnts.apply(
            lambda row: 1
            if ((row["last-n"] == 1) and (row["n"] > 1)) or ((row["last-n"] > 1) and (row["n"] == 1))
            else 0,
            axis=1,
        )
        cnts["exp-idx"] = cnts["switch"].cumsum()
        result = cnts.groupby("exp-idx").apply(
            lambda grp: pd.Series({"n": len(grp), "exploit": 0})
            if grp["n"].iloc[0] == 1
            else pd.Series({"n": grp["n"].sum(), "exploit": 1})
        )[:-1]
        # result = cnts.groupby('exp-idx').apply(lambda grp: pd.Series({'n':len(grp),'exploit':0}) if grp['n'].iloc[0]==1 else pd.Series({'n':grp['n'].iloc[-1],'exploit':1}))[:-1]
        arr_exploit = result[result["exploit"] == 1]["n"].value_counts()
        arr_exploit = sparse.csr_matrix(
            arr_exploit.reindex(range(1, max(arr_exploit.index) + 1), fill_value=0.0).values
        )
        arr_explore = result[result["exploit"] == 0]["n"].value_counts()
        arr_explore = sparse.csr_matrix(
            arr_explore.reindex(range(1, max(arr_explore.index) + 1), fill_value=0.0).values
        )

        with open(self.args.resultdir + user, "w") as fout:
            fout.write(
                user
                + "\t"
                + "explore"
                + "\t"
                + ":".join([",".join(a.astype(str)) for a in arr_explore.data, arr_explore.indices, arr_explore.indptr])
                + "\n"
            )
            fout.write(
                user
                + "\t"
                + "exploit"
                + "\t"
                + ":".join([",".join(a.astype(str)) for a in arr_exploit.data, arr_exploit.indices, arr_exploit.indptr])
                + "\n"
            )
        self.rootLogger.info("User {} processed successfully ({})".format(user, fi))

    def block_len_dists(self, fi):
        user = self.userFromFile(fi)
        blocks = pd.read_pickle(fi)["block"]
        result = blocks.value_counts().value_counts()
        arr = result.reindex(range(1, max(result.index) + 1), fill_value=0.0).values
        final_result = sparse.csr_matrix(arr)
        with open(self.args.resultdir + user, "w") as fout:
            fout.write(
                user
                + "\t"
                + ":".join(
                    [",".join(a.astype(str)) for a in final_result.data, final_result.indices, final_result.indptr]
                )
                + "\n"
            )
        self.rootLogger.info("User {} processed successfully ({})".format(user, fi))