Example #1
    def certScanner (self) :
        p = Pool(nodes = 512)
        cprint ("[+] Keywords : " + " ".join(str(x) for x in self.keywordList), 'green')
        # self.allipAddrList = self.shuffleList()
        self.allipAddrList = [x for x in self.shuffleList() if self.region in x ]
        
        for self.tryipClass in self.allipAddrList:
            self.ipExtractResult = self.ipExtract(self.tryipClass.split("@")[0])
            _max = len(self.ipExtractResult)
            cprint ("[+] Scanning IP Addr Class : " + self.tryipClass + "\t-- Number of scan target is :" + str(len(self.ipExtractResult)), 'green')

            with tqdm(total=_max) as pbar:
                pbar.set_description("[+] Progressing : %s " %self.tryipClass)
                for i, domain in tqdm(enumerate(p.imap(self.certChecker, self.ipExtractResult))):
                    pbar.update()
                    if domain is not None:
                        self.resList.append(domain)
                pbar.close()
                p.terminate() # stop the workers used for this batch
                p.restart()   # restart the pool so it can be reused for the next IP class

            if self.resList:
                self.printRes()

            else:
                cprint ("[!] No kewords found on this IP class \n", 'red')

            time.sleep(1)
            self.ipExtractResult = []
            self.resList = []
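The terminate()/restart() pair is what lets the single pathos pool above be reused across IP classes. A minimal, self-contained sketch of that batch-reuse pattern (the worker and the batches here are hypothetical stand-ins, not taken from the scanner):

from pathos.multiprocessing import ProcessingPool as Pool
from tqdm import tqdm

def check_target(target):
    # hypothetical stand-in for certChecker: return a hit or None
    return target if target % 7 == 0 else None

pool = Pool(nodes=8)
for batch in [range(0, 100), range(100, 200)]:   # hypothetical work batches
    hits = [r for r in tqdm(pool.imap(check_target, batch), total=len(batch))
            if r is not None]
    print("{} hits in this batch".format(len(hits)))
    pool.terminate()   # stop the workers used for this batch
    pool.restart()     # bring the pool back up for the next batch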
Example #2
    def train_multiprocessed(self, iInputDS, iTime, iSignal, iHorizon):
        pool = Pool(self.mOptions.mNbCores)
        self.defineTransformations(iInputDS, iTime, iSignal)
        # print([transform1.mFormula for transform1 in self.mTransformList]);
        args = []
        for transform1 in self.mTransformList:
            arg = cTraining_Arg(transform1.get_name(""))
            arg.mSigDec = cSignalDecompositionOneTransform()
            arg.mSigDec.mOptions = self.mOptions
            arg.mSigDec.mExogenousData = self.mExogenousData
            arg.mInputDS = iInputDS
            arg.mTime = iTime
            arg.mSignal = iSignal
            arg.mHorizon = iHorizon
            arg.mTransformation = transform1
            arg.mOptions = self.mOptions
            arg.mExogenousData = self.mExogenousData
            arg.mResult = None

            args.append(arg)

        for res in pool.imap(run_transform_thread, args):
            # print("FINISHED_TRAINING" , res.mName);
            self.mSigDecByTransform[res.mTransformation.get_name(
                "")] = res.mSigDec
Example #3
def run_pool(cmd_list, threads=None):
    if threads:
        pool = Pool(threads)
    else:
        pool = Pool()

    list(tqdm.tqdm(pool.imap(run, cmd_list), total=len(cmd_list)))
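In this example run and cmd_list come from elsewhere in the source file; a hedged sketch of how run_pool could be driven, with a subprocess-based run assumed:

import subprocess

def run(cmd):
    # assumed worker: execute one shell command and return its exit code
    return subprocess.call(cmd, shell=True)

run_pool(["echo one", "echo two", "echo three"], threads=2)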
Example #4
def get_installed_packages():
    """ get a list of all packages currently installed in the active environment """
    packages = []

    pool = Pool(4)

    # for dist in track(
    #     list(Distribution.discover()), description="[cyan]Grabbing dependency info"
    # ):
    #     packages.append(Package.from_dist(dist))

    dists = list(Distribution.discover())
    dists_num = len(dists)

    log.info("[bold]Found a total of {} distributions".format(dists_num),
             extra={"markup": True})

    for package_enum in enumerate(pool.imap(Package.from_dist, dists),
                                  start=1):
        package = package_enum[1]
        log.info("{0}/{1}: processed [bold cyan]{2} {3}[/bold cyan]".format(
            package_enum[0], dists_num, package.name, package.version),
                 extra={"markup": True})
        packages.append(package)

    return packages
Example #5
    def multi_Non_Tweep_friends(self, handle):
        min_position, links = self.get_tweets(handle)
        print("Scraping last 100 days of activity")

        while (True):
            min_position1, links1 = self.get_tweets(handle, min_position)
            links = links + links1
            if (min_position1 == None):
                break
            min_position = min_position1

        people_list = []

        link = [x for x in links if handle in x]
        link = self.duplicates(link)

        with Pool(10) as p:  # Pool(10) limits how many requests run at a time

            records = list(tqdm(p.imap(self.get_people, link),
                                total=len(link)))
            p.close()
            p.join()
            people_list = [item for sublist in records for item in sublist]
            people_list = self.duplicates(people_list)

        people_list = [x for x in people_list if x != handle]

        return (people_list)
Example #6
def q5_plot_chromatic_num_bounds_by_prob(n, prange, pstep, k=None,\
    clique_finder=greedy_find_clique_number, multi=False):
    """Plots a graph of number of colours against edge probability,
    for each of the various lower/upper bounds of chromatic number
    multi: True/False/int multiprocessing - yes/no/ num processes (default 4 if true)
    """
    probs = np.arange(prange[0], prange[1], pstep)
    graphs = [[get_random_graph(n, p, k) for _ in range(10)] for p in probs]
    mean_bounds = []
    pool = Pool(multi if type(multi) is int else 4)
    # graph_generator = pool.imap(multiprocessing_chrom_bounds_func, graphs) if multi else map(f, graphs)
    f = lambda graphs_list: list(map(get_chromatic_number_bounds, graphs_list))
    graph_generator = pool.imap(f, graphs) if multi else map(f, graphs)

    for bounds in tqdm.tqdm(graph_generator, total=len(graphs)):
        mean_bounds.append(np.mean(bounds, axis=0))

    pool.close()
    pool.join()

    mean_bounds = np.array(mean_bounds)
    plt.figure()
    for i, label in zip(range(mean_bounds.shape[1]), \
        ['lb_comp', 'lb_clique', 'ub_clique', 'ub_greedy_rand', 'ub_greedy_msd']):
        plt.plot(probs, mean_bounds[:, i], label=label)
    plt.legend()

    return probs, mean_bounds
Example #7
def goo():
    pool = Pool(4)
    #    def f(x):
    #        return foo(100 + x)
    stuff = list(tqdm.tqdm(pool.imap(foo, range(20)), total=20))
    print(stuff)
    print('aaa')
    pool.close()
    pool.join()
    print('bbb')
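pathos pools also offer uimap, which yields results in completion order rather than submission order; when, as above, the results are simply collected into a list, that can keep the progress bar moving more evenly. A small sketch with a hypothetical foo:

import time

import tqdm
from pathos.multiprocessing import ProcessingPool as Pool

def foo(x):
    # hypothetical stand-in for the foo used above
    time.sleep(0.01 * (x % 5))
    return x * x

pool = Pool(4)
stuff = list(tqdm.tqdm(pool.uimap(foo, range(20)), total=20))
print(sorted(stuff))
pool.close()
pool.join()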
Example #8
    def computePerfsInParallel(self, args):
        lModels = {}
        # print([arg.mName for arg in args])
        # print([arg.mModel.mOutName for arg in args])
        pool = Pool(self.mOptions.mNbCores)
        # results = [compute_perf_func(arg) for arg in args]
        for res in pool.imap(compute_perf_func, args):
            # print("FINISHED_PERF_FOR_MODEL", res.mName)
            lModels[res.mName] = res.mModel

        # pool.close()
        # pool.join()
        return lModels
Example #9
def combine_scores():
    """Combine the scores from all patients and dump into all_dict.txt.
    """

    all_dicts = {}
    duration_dict = {}
    all_dict_q = multiprocessing.Manager().Queue()
    duration_dict_q = multiprocessing.Manager().Queue()
    dirs = [
        y for y in os.listdir(patient_dir)
        if os.path.isdir(os.path.join(patient_dir, y))
    ]
    bar = progressbar.ProgressBar(redirect_stdout=True, max_value=len(dirs))
    f = functools.partial(scores_and_duration_dict, all_dict_q,
                          duration_dict_q)
    p = Pool()

    for i, _ in enumerate(p.imap(f, dirs, chunksize=50), 1):
        bar.update(i)
    p.close()
    p.join()

    while not all_dict_q.empty():
        patient_dict = all_dict_q.get()
        dur_dict = duration_dict_q.get()

        for i in patient_dict:
            print(i)

            if i not in all_dicts:
                all_dicts[i] = patient_dict[i]
            else:
                all_dicts[i].update(patient_dict[i])

        for i in dur_dict:
            print(i)

            if i not in duration_dict:
                duration_dict[i] = dur_dict[i]
            else:
                duration_dict[i].update(dur_dict[i])
    print('done combining scores, dumping...')
    json.dump(all_dicts, open(os.path.join(patient_dir, 'all_dict.txt'), 'w'))
    json.dump(duration_dict,
              open(os.path.join(patient_dir, 'duration_dict.txt'), 'w'))
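The scores_and_duration_dict worker bound with functools.partial above is not part of this excerpt; a minimal hypothetical stand-in with the same call shape (the two managed queues first, the imap'd directory name last) could look like this:

def scores_and_duration_dict(all_dict_q, duration_dict_q, patient_dir):
    # hypothetical stand-in: compute per-patient score/duration dicts and
    # push them onto the managed queues for the parent process to merge
    all_dict_q.put({patient_dir: {'score': 0.0}})          # placeholder scores
    duration_dict_q.put({patient_dir: {'duration': 0.0}})  # placeholder durations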
Example #10
        # Ignore black images
        if slice.max() == 0 and slice.min() == 0:
            continue

        file_name = os.path.join(spectogram_dir_path, "{}{}.png".format(index, i))
        scipy.misc.imsave(file_name, np.squeeze(slice))

    return f


if __name__ == '__main__':
    dataset_dir = '/media/work/audio/musiclid/youtube_spoken/'
    output_path_raw = os.path.join(dataset_dir, 'raw')
    output_path_spectograms = os.path.join(dataset_dir, 'spectograms')

    pool = Pool()

    for language in os.listdir(output_path_raw):
        for source_name in os.listdir(os.path.join(output_path_raw, language)):
            files = glob.glob(os.path.join(output_path_raw, language, source_name, "*.mp3"))

            spectogram_dir_path = os.path.join(output_path_spectograms, language, source_name)
            if not os.path.exists(spectogram_dir_path):
                os.makedirs(spectogram_dir_path)

            data = [(f, spectogram_dir_path, i) for i, f in enumerate(files)]
            for f in tqdm(pool.imap(segment_file, data), 'spectograms for {}/{}'.format(language, source_name),
                          total=len(files)):
                pass
Example #11
class analyze(setup.setup):

    def __init__(self,args,logging_level=logging.INFO):

         super(analyze, self ).__init__(args,logging_level)


    # set up processing pool and run all analyses specified in args
    def run(self):


        if self.args.jumpdists:
            n_bins=100.
            bin_width = 1/n_bins
            bins = np.arange(0,1+bin_width,1/n_bins)

            if self.args.file:
                user,vals = self.artist_jump_distributions(self.args.file,bins=bins,self_jumps=False)
                with open(self.args.resultdir+user,'w') as fout:
                    fout.write(','.join(vals.astype(str))+'\n')



            else:
                raise NotImplementedError('not implemented!')
                self.pool = Pool(self.args.n)
                self.rootLogger.info("Pool started")

                self.rootLogger.info("Starting jump distance analysis")

                func_partial = partial(self.artist_jump_distributions,bins=bins,self_jumps=False)
                with open(self.args.resultdir+'jumpdists','w') as fout:
                    for user,vals in self.pool.imap(func_partial,self.listen_files):
                        fout.write(user+'\t'+','.join(vals.astype(str))+'\n')

                self.pool.close()
                self.rootLogger.info("Pool closed")

        if self.args.blockdists:
            #self.rootLogger.info("Starting block distance analysis")
            self.mean_block_distances(self.args.file)

        if self.args.diversity_dists:
            bins = np.arange(0,1.01,.01)
            self.diversity_distributions(self.args.file,bins=bins)

        if self.args.clustering:
            self.clustering(self.args.file)

        if self.args.values:
            self.patch_values(self.args.file)

        if self.args.exp:
            self.explore_exploit(self.args.file)

        if self.args.patch_len_dists:
            self.patch_len_dists(self.args.file)


    # calculate distribution (using histogram with specified bins)
    # of sequential artist-to-artist distances
    def artist_jump_distributions(self,fi,bins,self_jumps=False):
        user = fi.split('/')[-1][:-4]
        df = pd.read_pickle(fi)
        if self_jumps:
            vals = np.histogram(df['dist'].dropna(),bins=bins)[0]
        else:
            vals = np.histogram(df['dist'][df['dist']>0],bins=bins)[0]
        self.rootLogger.info('artist jump distances done for user {} ({})'.format(user,fi))
        return user,vals

    # calculate distribution (using histogram with specified bins)
    # of patch diversity for each user

    # awk 'FNR==1' * > diversity_dists_zeros
    # awk 'FNR==2' * > diversity_dists_nozeros
    def diversity_distributions(self,fi,bins):
        if 'patches' not in fi:
            raise ValueError('WRONG DATATYPE')
        user = fi.split('/')[-1].split('_')[0]
        df = pd.read_pickle(fi).dropna(subset=['diversity'])
        zeros = np.histogram(df[df['n']>=5]['diversity'],bins=bins)[0]
        nozeros = np.histogram(df[(df['n']>=5)&(df['diversity']>0)]['diversity'],bins=bins)[0]

        zeros = zeros/float(zeros.sum())
        nozeros = nozeros/float(nozeros.sum())

        with open(self.args.resultdir+user,'w') as fout:
            fout.write(user+'\t'+'zeros'+'\t'+','.join(zeros.astype(str))+'\n')
            fout.write(user+'\t'+'nozeros'+'\t'+','.join(nozeros.astype(str))+'\n')
        self.rootLogger.info('diversity distributions done for user {} ({})'.format(user,fi))


    def mean_block_distances(self,fi,n=100):

        def cos_nan(arr1,arr2):
            if np.any(np.isnan(arr1)) or np.any(np.isnan(arr2)):
                return np.nan
            else:
                return cosine(arr1,arr2)


        user = fi.split('/')[-1].split('_')[0]
        df = pd.read_pickle(fi)
        blocks = df[df['n']>=5].dropna()

        result = []
        for i in xrange(len(blocks)-n):
            first = blocks['centroid'].iloc[i]
            result.append(np.array(blocks['centroid'][i+1:i+n+1].apply(lambda val: cos_nan(val,first))))
        result = np.nanmean(np.vstack(result),0)

        with open(self.args.resultdir+user,'w') as fout:
            fout.write('\t'.join([user,'patch',','.join(result.astype(str))])+'\n')

        self.rootLogger.info('Block distances for user {} processed successfully ({})'.format(user,fi))


        # now shuffled
        # idx = np.array(blocks.index)
        # np.random.shuffle(idx)
        # blocks = blocks.reindex(idx)

        # result_random = []
        # for i in xrange(len(blocks)-n):
        #     first = blocks['centroid'].iloc[i]
        #     result_random.append(np.array(blocks['centroid'][i+1:i+n+1].apply(lambda val: cos_nan(val,first))))
        # result_random = np.nanmean(np.vstack(result_random),0)

        # with open(self.args.resultdir+user,'w') as fout:
        #     fout.write('\t'.join([user,'patch',','.join(result.astype(str))])+'\n')
        #     fout.write('\t'.join([user,'patch_random',','.join(result_random.astype(str))])+'\n')
        # self.rootLogger.info('Block distances for user {} processed successfully ({})'.format(user,fi))

    def clustering(self,fi):
        df = pd.read_pickle(fi)
        user = fi.split('/')[-1].split('_')[0]

        mask = (df['centroid'].apply(lambda arr: ~np.any(np.isnan(arr))).values)&(df['n']>=5)&(df['diversity']<=0.2)
        clust_data = df[mask].reset_index()
        arr =  np.vstack(clust_data['centroid'])
        Z = linkage(arr, 'complete')
        clusters = fcluster(Z,t=0.2,criterion='distance')
        assignments = np.repeat(np.nan,len(df))
        assignments[np.where(mask)] = clusters
        df['patch_clust'] = assignments
        df.to_pickle('{}{}.pkl'.format(self.args.resultdir,user))
        self.rootLogger.info('Patch clusters for user {} processed successfully ({})'.format(user,fi))

    def patch_len_dists(self,fi):
        df = pd.read_pickle(fi)
        user = fi.split('/')[-1][:-4]

        explore = df[np.isnan(df['patch_clust'])]
        result_explore = explore['n'].value_counts()

        df['explore'] = np.isnan(df['patch_clust']).astype(int)
        df['explore-idx'] = df['explore'].cumsum()

        result_exploit =  df.groupby('explore-idx').apply(lambda df: df.dropna()['n'].sum()).value_counts()

        result_explore = result_explore.reindex(xrange(1,max(result_explore.index)+1),fill_value=0.).values
        result_exploit = result_exploit.reindex(xrange(1,max(result_exploit.index)+1),fill_value=0.).values

        result_explore = sparse.csr_matrix(result_explore)
        result_exploit = sparse.csr_matrix(result_exploit)


        with open(self.args.resultdir+user,'w') as fout:
            fout.write(user+'\t'+'explore'+'\t'+':'.join([','.join(a.astype(str)) for a in (result_explore.data,result_explore.indices,result_explore.indptr)])+'\n')
            fout.write(user+'\t'+'exploit'+'\t'+':'.join([','.join(a.astype(str)) for a in (result_exploit.data,result_exploit.indices,result_exploit.indptr)])+'\n')
        self.rootLogger.info('User {} processed successfully ({})'.format(user,fi))



    def explore_exploit(self,fi):

        user = fi.split('/')[-1][:-4]

        df_patches_raw = pd.read_pickle(fi)

        # add time in next bout
        df_patches_raw['next_n'] = df_patches_raw['n'].shift(-1)

        # add patch values
        # listensPerPatch = df_patches_raw.groupby('patch_clust')['n'].sum()
        # overall_prop = listensPerPatch/float(df_patches_raw['n'].sum())
        # overall_prop.name = 'final_value'
        # df_patches_raw = df_patches_raw.join(overall_prop,on='patch_clust')


        """
        # time in next exploit patch as function of exploration time
        result = df_patches_raw[np.isnan(df_patches_raw['patch_clust'])].groupby('n')['next_n'].mean()

        fout.write(user+'\t'+'next-exploit-vs-explore'+'\t'+','.join(["{}:{}".format(a,b) for a,b in result.iteritems()])+'\n')
        """
        # total time exploiting as a function of time exploring
        df_patches_raw['explore'] = np.isnan(df_patches_raw['patch_clust']).astype(int)
        df_patches_raw['explore-idx'] = df_patches_raw['explore'].cumsum()

        # combine all exploit listens
        #grp_explore = df_patches_raw.groupby('explore-idx').apply(lambda df: pd.DataFrame({'n':[df['n'].iloc[0]],'n-exploit':[df['n'][1:].sum()]}))

        # only last exploit bout
        grp_explore = df_patches_raw.groupby('explore-idx').apply(lambda df: pd.DataFrame({'n':[df['n'].iloc[0]],'n-exploit':[df['n'].iloc[-1]]}))

        #result = grp_explore.groupby('n')['n-exploit'].mean()
        #fout.write(user+'\t'+'total-exploit-vs-explore'+'\t'+','.join(["{}:{}".format(a,b) for a,b in result.iteritems()])+'\n')
        """
        # exploration time as a function of exploitation time
        grp_exploit = grp_explore.copy()
        grp_exploit['n-explore'] = grp_exploit['n'].shift(-1)

        result = grp_exploit.groupby('n-exploit')['n-explore'].mean()
        fout.write(user+'\t'+'explore-vs-exploit'+'\t'+','.join(["{}:{}".format(a,b) for a,b in result.iteritems()])+'\n')
        """

        # prob exploit given explore time - already done

        # explore_only = df_patches_raw[np.isnan(df_patches_raw['patch_clust'])]
        # result = explore_only['n'][:-1].value_counts()
        # arr = result.reindex(xrange(1,max(result.index)+1),fill_value=0.).values
        # final_result = arr/(np.cumsum(arr[::-1])[::-1])
        # final_result = sparse.csr_matrix(final_result)

        # with open(self.args.resultdir+user+'_exploit','w') as fout:
        #     fout.write(user+'\t'+':'.join([','.join(a.astype(str)) for a in final_result.data,final_result.indices,final_result.indptr])+'\n')


        # prob explore given exploit time
        result = grp_explore['n-exploit'][grp_explore['n-exploit']>0].value_counts()
        arr = result.reindex(xrange(1,max(result.index)+1),fill_value=0.).values
        final_result = arr/np.cumsum(arr[::-1])[::-1]
        final_result = sparse.csr_matrix(final_result)

        with open(self.args.resultdir+user+'_explore','w') as fout:
            fout.write(user+'\t'+':'.join([','.join(a.astype(str)) for a in (final_result.data,final_result.indices,final_result.indptr)])+'\n')


        #fout.write(user+'\t'+'prob-explore-given-exploit'+'\t'+','.join(["{}:{}".format(a,b) for a,b in result.iteritems()])+'\n')

        """
        # patch value as a function of exploration time
        df_patches_raw['final_value_next'] = df_patches_raw['final_value'].shift(-1)
        result = df_patches_raw[np.isnan(df_patches_raw['patch_clust'])].groupby('n')['final_value_next'].mean()
        fout.write(user+'\t'+'exploit-value-vs-explore'+'\t'+','.join(["{}:{}".format(a,b) for a,b in result.iteritems()])+'\n')
        """

        self.rootLogger.info('User {} processed successfully ({})'.format(user,fi))
Example #12

if __name__ == '__main__':
    output_path_raw = '/media/work/audio/musiclid/youtube_spoken/raw'
    output_path_segmented = '/media/work/audio/musiclid/youtube_spoken/segmented'

    pool = Pool()

    for language in os.listdir(output_path_raw):
        for source_name in os.listdir(os.path.join(output_path_raw, language)):
            files = glob.glob(
                os.path.join(output_path_raw, language, source_name, "*.mp3"))

            segment_dir_path = os.path.join(output_path_segmented, language,
                                            source_name)
            if not os.path.exists(segment_dir_path):
                os.makedirs(segment_dir_path)

            data = [(f, segment_dir_path) for f in files]
            for f in tqdm(pool.imap(segment_file, data),
                          'segmenting files in {}/{}'.format(
                              language, source_name),
                          total=len(files)):
                pass

            file_counter[language] += len(
                glob.glob(os.path.join(segment_dir_path, "*.wav")))

    print file_counter
Example #13
def main(argv):

  parser = ArgumentParser(description="A baseball pitch simulator.")

  parser.add_argument("config_file",
                      action="store",
                      help="Configuration file." )
  parser.add_argument("-l", "--list-pitchers",
                      action="store_true",
                      help="List the name of all built-in pitcher available for use." )
  parser.add_argument("-p", "--show-performance",
                      action="store_true",
                      help="Generate plot(s) showing model performance after training." )



  args = parser.parse_args(argv)

  if args.list_pitchers:
    pitchers = [ p for p in dir(Pitchers) if isinstance( getattr(Pitchers,p), Pitchers.Pitcher ) ]
    print("Available pitchers:")
    for p in pitchers:
      print("\t",p)
    sys.exit(0)

  # load config
  config_file = pathlib.Path(args.config_file).resolve()
  with config_file.open() as f:
    conf = yaml.safe_load(f)

  pitcher = None
  pitcher_name = conf.get("pitcher",dict()).get("name",None)
  if pitcher_name is not None:
    pitcher = copy.deepcopy(getattr(Pitchers,pitcher_name))

  # setup data files
  training_conf = conf.get("training",dict())
  training_data_file_template = training_conf.get("training data file","{name}-training-data-{num}-{id}.yaml")
  input_model_file_template = training_conf.get("input model file","{name}-model-{id}.yaml")
  output_model_file_template = training_conf.get("output model file","{name}-model-{id}-new.yaml")
  num_training_trials = training_conf.get("number trials",1000)
  num_training_epochs = training_conf.get("number epochs",100)


  context = dict()
  context["name"] = pitcher_name
  context["id"] = pitcher.id()
  context["num"] = num_training_trials

  training_data_file = pathlib.Path(training_data_file_template.format(**context)).resolve()
  input_model_file = pathlib.Path(input_model_file_template.format(**context)).resolve()
  output_model_file = pathlib.Path(output_model_file_template.format(**context)).resolve()

  sim = Simulation()
  sim.configure(conf.get('simulation',dict()))

  if input_model_file.is_file():
    pitcher.aim_model.load(str(input_model_file))
  else:
    print(f"WARNING: '{str(input_model_file)}' does not exist.")
  losses = pitcher.train(sim,num_training_epochs,num_training_trials,training_data_file)

  if output_model_file.is_file():
    print(f"WARNING: '{str(output_model_file)}' already exists. It will be OVERWRITTEN.")
  pitcher.aim_model.save(str(output_model_file))

  print("Summary:")
  print(f"\tinitial training loss: {losses[0]}")
  print(f"\t  final training loss: {losses[-1]}")


  if args.show_performance:
    print("Evaluating pitcher")
    configs = list()

    aim_locations = list()
    for x in numpy.arange( -15, 15+1,5 ):
      for z in numpy.arange( 12,5*12+1, 12 ):
        aim_x = Q_(x,'inch')
        aim_z = Q_(z,'inch')
        config = pitcher.configure_throw( 1, Q_(100,'percent'), aim_z, aim_x)
        aim_locations.append( [aim_x,aim_z] )
        configs.append(config)

    def compute_location(config):
      trajectory = sim.run(config, terminate_function=lambda x: x[-1][2] < 0, record_all=False)
      act_x = Q_(trajectory[0][1],'m')
      act_z = Q_(trajectory[0][3],'m')
      return [act_x,act_z]

    pool = Pool()
    locations = list(tqdm.tqdm(pool.imap( compute_location, configs), total=len(configs)))

    txs = [ r[0].to("in").magnitude for r in aim_locations ]
    tzs = [ r[1].to("in").magnitude for r in aim_locations ]
    axs = [ r[0].to("in").magnitude for r in locations ]
    azs = [ r[1].to("in").magnitude for r in locations ]


    fig = go.Figure(data=[go.Scatter(x=txs,y=tzs,mode='markers'),go.Scatter(x=axs,y=azs,mode='markers')])
    fig.show()
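Note that compute_location is a closure defined inside main (and terminate_function is a lambda); the pathos pool can ship these to its workers because it serializes with dill rather than the standard pickle. A tiny illustration, separate from the simulator code:

from pathos.multiprocessing import ProcessingPool as Pool

square = lambda v: v * v  # lambdas and closures are fine: pathos pickles them with dill

p = Pool(2)
print(list(p.imap(square, [1, 2, 3])))  # [1, 4, 9]
p.close()
p.join()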
Example #14
def newspaperarchive_scraper(search_terms, start_date, end_date, filepath):
    #Define functions
    #Define date generator
    def perdelta(start, end, delta):
        curr = start
        while curr < end:
            yield curr
            curr += delta

    #Define URL grabber
    def newspaperarchive_url(search_terms, date):
        day = "&pd={0}".format(date.day)
        month = "&pm={0}".format(date.month)
        year = "&py={0}".format(date.year)
        search_terms = "&pep={0}".format(search_terms.replace(" ", "-"))
        url = "http://access.newspaperarchive.com/tags/?pci=7&ndt=ex" + day + month + year + search_terms + "&pr=100"
        return url

    def test_matches(html):
        test = BeautifulSoup(html, 'html.parser')
        test_result = test.find('h2', text="0 Results for search")
        return test_result

    def extract_articles(page):    
        #Grab articles
        articles = page.find_all('div', class_="searchResultBlock searchResultBlockWithThumb")
        return articles

    def extract_data(article, day):
        line = {}
        line['archive'] = "newspaperarchive"
        try:
            line["publication_title"] = article.h4.a.get_text().strip().encode('utf8')
        except:
            line["publication_title"] = ""
        line["href"] = article.a['href']    
        try:
            line['publication_id'] = re.search("(?<=http://access\.newspaperarchive\.com/)([^/]+/[^/]+/[^/]+/[^/]+)", line['href']).group(0)
        except:
            line['publication_id'] = ""
        line["search_date"] = day
        try:
            line['page'] = re.search("(?<=/page-)(\d\d?)", line['href']).group(0)
        except:
            line['page'] = ""
        line['search_terms'] = search_terms
        return line

    def scrape(search_terms, day):
        sleep(1)
        print day
        #Visit URL and parse
        url = newspaperarchive_url(search_terms, day)
        wait = 0
        while True:
            try:
                start = requests.get(url, timeout=(1,180)).text
                break
            except:
                print "... trying again ..." + str(day)
                sleep(1.5**wait)
                wait += 1

        #Are there any hits?
        if test_matches(start) == None:
            lines = []
            nextLink = []
            page = start
            page_number = 2

            while nextLink != None:
                soup = BeautifulSoup(page, 'html.parser')
                articles = extract_articles(soup)
                #extract article data
                for article in articles:
                    lines.append(extract_data(article, day))

                #Get nextLink
                try:
                    nextLink = soup.find('a', text=page_number)['href']
                    wait = 0
                    while True:
                        try:
                            page = requests.get(nextLink, timeout=(1,180)).text
                            break
                        except:
                            print "... trying again ..." + str(day)
                            sleep(1.5**wait)
                            wait += 1
                    page_number += 1
                except TypeError:
                    nextLink = None           

            return lines

        else:
            return None

    #Complete scraper
    #Parallel processing
    if __name__ == "__main__":
        #Create file name
        timeperiod = str(start_date) + "to" + str(end_date - timedelta(days=1))
        filename = "newspaperarchive-" + timeperiod + ".csv"

        pool = Pool(10)

        date_list = []
        for date in perdelta(start_date, end_date, timedelta(days=1)):
            date_list.append(date)

        search_terms_list = [search_terms] * len(date_list)

        result_iter = pool.imap(scrape, search_terms_list, date_list)

        #Create CSV
        fields = ["archive", "publication_title", "publication_id", "search_date", "page", "href", "search_terms"]
        with open("/".join((filepath,filename)), "w") as w:
            writer = csv.DictWriter(w, fieldnames=fields)
            writer.writeheader()
            for result in result_iter:
                if result != None:
                    writer.writerows(result)
Example #15
    def init_data(self, data_name, n_chunk=1024):
        print(f'Initializing {data_name} data...')

        def transform_triple_to_hrt(triple_idx):
            """ Transforms triple-idx (as a whole) to h/r/t format """
            if triple_idx == -1:  # for response_triple
                return NAF_TRIPLE
            triple = self.idx2triple[triple_idx]
            h, r, t = triple.split(', ')
            return [self.word2idx[h], self.rel2idx[r], self.word2idx[t]]

        def process_file(root, inp):
            start_i, filename = inp
            n_sample = line_count(filename)

            post = np.zeros((n_sample, self.args.max_sentence_len),
                            dtype=np.int32)
            post_length = np.zeros(
                (n_sample), dtype=np.int32)  # valid length (without pad)
            response = np.zeros((n_sample, self.args.max_sentence_len),
                                dtype=np.int32)
            response_length = np.zeros((n_sample), dtype=np.int32)
            # post_triple = np.zeros((n_sample, self.args.max_sentence_len), dtype=np.int32)
            triple = np.zeros((n_sample, self.args.max_sentence_len,
                               self.args.max_triple_len, 3),
                              dtype=np.int32)
            entity = np.zeros((n_sample, self.args.max_sentence_len,
                               self.args.max_triple_len),
                              dtype=np.int32)
            response_triple = np.zeros(
                (n_sample, self.args.max_sentence_len, 3), dtype=np.int32)

            max_post_len, max_response_len, max_triple_len = 0, 0, 0

            with jsonlines.open(filename) as df:
                for i, line in enumerate(df):

                    pl, rl = len(line['post']) + 2, len(line['response']) + 2
                    post_length[i] = pl
                    response_length[i] = rl

                    max_post_len = max(pl, max_post_len)
                    max_response_len = max(rl, max_response_len)
                    max_triple_len = max([len(l)
                                          for l in line['all_triples']] +
                                         [max_triple_len])

                    all_triples = [
                        line['all_triples'][i - 1] if i > 0 else [-1]
                        for i in line['post_triples']
                    ]

                    post[i, :pl] = [SOS_IDX] + [
                        self.get_word_idx(p) for p in line['post']
                    ] + [EOS_IDX]
                    response[i, :rl] = [SOS_IDX] + [
                        self.get_word_idx(r) for r in line['response']
                    ] + [EOS_IDX]
                    # post_triple[i, 1:pl-1] = np.array(line['post_triples']) # [0, 0, 1, 0, 2...]
                    response_triple[i, :rl] = [NAF_TRIPLE] + [
                        transform_triple_to_hrt(rt)
                        for rt in line['response_triples']
                    ] + [NAF_TRIPLE]

                    # put NAF_TRIPLE/entity at index 0
                    triple[i] = pad_2d(
                        [[NAF_TRIPLE]] +
                        [[transform_triple_to_hrt(t) for t in triples]
                         for triples in all_triples] + [[NAF_TRIPLE]],
                        length=(self.args.max_sentence_len,
                                self.args.max_triple_len, 3))
                    entity[i] = pad_2d(
                        [[NAF_IDX]] +
                        [[self.entidx2wordidx[e] for e in entities]
                         for entities in line['all_entities']] + [[NAF_IDX]],
                        length=(self.args.max_sentence_len,
                                self.args.max_triple_len))

                # dump to zarr
                root['post'][start_i:start_i + n_sample] = post
                root['post_length'][start_i:start_i + n_sample] = post_length
                root['response'][start_i:start_i + n_sample] = response
                root['response_length'][start_i:start_i +
                                        n_sample] = response_length
                # root['post_triple'][start_i : start_i+n_sample] = post_triple
                root['triple'][start_i:start_i + n_sample] = triple
                root['entity'][start_i:start_i + n_sample] = entity
                root['response_triple'][start_i:start_i +
                                        n_sample] = response_triple

            return max_post_len, max_response_len, max_triple_len

        toread = [
            f'{self.data_path}/{data_name}set_pieces/{piece}'
            for piece in os.listdir(f'{self.data_path}/{data_name}set_pieces')
        ]
        n_lines = sum([line_count(piece) for piece in toread])
        init_n_lines = math.ceil(
            n_lines /
            n_chunk) * n_chunk  # avoid errors when the last piece is smaller than the specified chunk size

        root = zarr.open(f'{self.data_path}/{data_name}set_new.zarr', mode='w')
        post = root.zeros('post',
                          shape=(init_n_lines, self.args.max_sentence_len),
                          chunks=(n_chunk, None),
                          dtype='i4')
        post_length = root.zeros('post_length',
                                 shape=(init_n_lines, ),
                                 chunks=(n_chunk, ),
                                 dtype='i4')  # valid length (without pad)
        response = root.zeros('response',
                              shape=(init_n_lines, self.args.max_sentence_len),
                              chunks=(n_chunk, None),
                              dtype='i4')
        response_length = root.zeros('response_length',
                                     shape=(init_n_lines, ),
                                     chunks=(n_chunk, ),
                                     dtype='i4')
        post_triple = root.zeros('post_triple',
                                 shape=(init_n_lines,
                                        self.args.max_sentence_len),
                                 chunks=(n_chunk, None),
                                 dtype='i4')
        triple = root.zeros('triple',
                            shape=(init_n_lines, self.args.max_sentence_len,
                                   self.args.max_triple_len, 3),
                            chunks=(n_chunk, None, None, None),
                            dtype='i4')
        entity = root.zeros('entity',
                            shape=(init_n_lines, self.args.max_sentence_len,
                                   self.args.max_triple_len),
                            chunks=(n_chunk, None, None),
                            dtype='i4')
        response_triple = root.zeros('response_triple',
                                     shape=(init_n_lines,
                                            self.args.max_sentence_len, 3),
                                     chunks=(n_chunk, None, None),
                                     dtype='i4')

        pool = Pool(min(len(toread), mp.cpu_count()))
        func = functools.partial(process_file, root)
        iterinp = [(i * self.args.data_piece_size, filename)
                   for i, filename in enumerate(toread)]
        max_post_lens, max_response_lens, max_triple_lens = zip(
            *tqdm(pool.imap(func, iterinp), total=len(iterinp)))

        max_post_len, max_response_len, max_triple_len = max(
            max_post_lens), max(max_response_lens), max(max_triple_lens)

        # trim remaining space
        post.resize(n_lines, max_post_len)
        post_length.resize(n_lines)
        response.resize(n_lines, max_response_len)
        response_length.resize(n_lines)
        post_triple.resize(n_lines, max_post_len)
        triple.resize(n_lines, max_post_len, max_triple_len, 3)
        entity.resize(n_lines, max_post_len, max_triple_len)
        response_triple.resize(n_lines, max_response_len, 3)

        print(
            f'Dumped {data_name} at: {self.data_path}/{data_name}set_new.zarr')
Example #16
    def RunCutplan(self):
        # initialisations
        cbL = 3
        timer = True
        id = []
        # t = time.time()
        if id == []:
            start_id = sum(self.completed)
            id = list(range(start_id, self.Cutplans.shape[0]))
        numproc = cpu_count() - 2
        p = Pool(processes=numproc)
        # cpSched = self.Cutplans.iloc[id]
        total = len(id)*1000
        for cID in id:
            c = self.Cutplans.iloc[cID]
            # find desc to be used to open the correct log data file
            desc = c.Description[2:4]+"-"+str(int(c.Description[5:7])-1)
            # get data from log data file
            LD = pd.read_csv(self.logPath+desc+'.csv')

            # initialise recovery
            recovery = Recovery(c)
            iterLog = []
            iterC = []
            for i in range(LD.shape[0]):
                log = LD.iloc[i]
                iterLog.append(log)
                iterC.append(c)

# =============================================================================
#             completed = []
#             for lID in range(len(iterLog)):
#                 log = iterLog[lID]
#                 coords = GetLogCoords(log, c)
#                 completed.append(coords)
#                 Timer(id, cID, lID, time.time()-t)
# =============================================================================
            # if id.index(cID) > 0:
            #     p.restart()
            data = []
            data = p.imap(GetLogCoords, iterLog, iterC)
            completed = []
            i = 0
            while len(completed) < LD.shape[0]:
                try:
                    res = next(iter(data))
                    completed.append(res)
                    if self.abort:
                        self.abort = False
                        return
                    if timer:
                        count = id.index(cID)*1000 + len(completed)
                        self.l_progress.emit(count/total)
                        # Timer(id, cID, len(completed)-1, time.time()-t)
                except BaseException:
                    break

            self.AveR[cID] = Recovery(c)
            self.BoardBreakdown[cID] = BoardBreakdown(c)
            minW = [1000000, 1000000]
            minH = [1000000, 1000000]
            minWID = [0, 0]
            minHID = [0, 0]
            numOF = 0
            for lID in range(LD.shape[0]):
                coords = completed[lID]
                newW, newH = CalcUseable(coords)
                if newW < minW[0]:
                    minW[0] = newW
                    minWID[0] = lID
                if newH < minH[0]:
                    minH[0] = newH
                    minHID[0] = lID
                newW1, newH1 = CalcUseable(coords, cbL)
                if newW1 < minW[1]:
                    minW[1] = newW1
                    minWID[1] = lID
                if newH1 < minH[1]:
                    minH[1] = newH1
                    minHID[1] = lID
                # OF = recovery.RunRecoveryRand(coords, offS=2.725)
                # numOF += not OF
                OF = recovery.RunRecovery(coords)
                numOF += not OF
                self.AveR[cID].AddRecovery(recovery)
                self.BoardBreakdown[cID].AddRecovery(recovery)
                self.LogVol[cID, lID] = CalcBoardVol(
                    LD.iloc[lID], c, recovery
                )
                # recovery.RunRecovery(coords)
                # self.LogVol[cID, lID] = CalcBoardVol(
                #     LD.iloc[lID], c, recovery
                # )

            self.AveR[cID].AverageRecovery(LD.shape[0])
            self.BoardBreakdown[cID].AverageRecovery(LD.shape[0])
            self.MinHLog[cID][0] = minHID[0]
            self.MinWLog[cID][0] = minWID[0]
            self.MinW[cID][0] = minW[0]
            self.MinH[cID][0] = minH[0]
            self.MinHLog[cID][1] = minHID[1]
            self.MinWLog[cID][1] = minWID[1]
            self.MinW[cID][1] = minW[1]
            self.MinH[cID][1] = minH[1]
            self.OpenFacePerc[cID] = numOF/LD.shape[0]

            time.sleep(0.1)

            self.completed[cID] = True
            self.cp_progress.emit(cID)

        self.EstVol = np.nanmean(self.LogVol, 1) * np.array(
            self.Cutplans.LogCount)

        self.finished.emit()
        return self.AveR
Example #17
from pathos.multiprocessing import ProcessingPool
from tqdm import tqdm


def F(X, lamda=10, weight=0.05):
    print(X, lamda, weight)
    res = X * weight + lamda  # placeholder result so the snippet runs end to end
    return res


zip_lamda = [i for i in range(10)]
x = [i + 10 for i in range(10)]

pool = ProcessingPool()  # instantiate the pool (imap is an instance method)
with tqdm(total=len(x)) as t:
    for i, res in enumerate(pool.imap(F, x, zip_lamda)):
        t.update()

pool.close()
pool.join()
Example #18
        def main():
            parser = argparse.ArgumentParser()
            parser.add_argument(
                'source_path',
                help="Path to the video or audio file to subtitle",
                nargs='?')
            parser.add_argument(
                '-C',
                '--concurrency',
                help="Number of concurrent API requests to make",
                type=int,
                default=10)
            parser.add_argument(
                '-o',
                '--output',
                help=
                "Output path for subtitles (by default, subtitles are saved in \ the same directory and name as the source path)"
            )
            parser.add_argument('-F',
                                '--format',
                                help="Destination subtitle format",
                                default="srt")
            parser.add_argument('-S',
                                '--src-language',
                                help="Language spoken in source file",
                                default="en")
            parser.add_argument('-D',
                                '--dst-language',
                                help="Desired language for the subtitles",
                                default="en")
            parser.add_argument(
                '-K',
                '--api-key',
                help=
                "The Google Translate API key to be used. (Required for subtitle translation)"
            )
            parser.add_argument('--list-formats',
                                help="List all available subtitle formats",
                                action='store_true')
            parser.add_argument(
                '--list-languages',
                help="List all available source/destination languages",
                action='store_true')

            if (os.name == "posix"):
                print os.system("uname -a")
            else:
                print "unknown OS"

            args = parser.parse_args()
            # print "arguments",args
            args.source_path = str(self.filename)
            print args.source_path, "SOURCE PATH"
            # print "CONCURRENCY >>>", args.concurrency
            # print args
            path = args.source_path[:-3]
            srt_path = path + "srt"
            print srt_path

            audio_filename, audio_rate = extract_audio(args.source_path)
            regions = find_speech_regions(audio_filename)
            pool = ProcessingPool(args.concurrency)
            converter = FLACConverter(source_path=audio_filename)

            transcripts = []
            if regions:
                try:
                    widgets = [
                        "Converting speech regions to FLAC files: ",
                        Percentage(), ' ',
                        Bar(), ' ',
                        ETA()
                    ]
                    pbar = ProgressBar(widgets=widgets,
                                       maxval=len(regions)).start()
                    extracted_regions = []
                    for i, extracted_region in enumerate(
                            pool.imap(converter, regions)):
                        extracted_regions.append(extracted_region)
                        pbar.update(i)
                        self.progress1.setValue(i)
                    pbar.finish()

                except KeyboardInterrupt:
                    pbar.finish()
                    pool.terminate()
                    pool.join()
                    print "Cancelling transcription"
                    return 1

            os.remove(audio_filename)

            return 0
            print "... trying again ..."
            sleep(1.5**wait)
            wait += 1    
    if count > 0:
        t = r['titleData']
        return {day : Set([x['value'] for x in t])}
    else:
        return {day : Set()}

start_date = date(1880,1,1)
end_date = date(1941,1,1)
date_list = [str(date) for date in perdelta(start_date, end_date, timedelta(days=1))]


pool = Pool(10)
result_iter = pool.imap(scrape_day, date_list)
title_sets = {}
for result in result_iter:
    title_sets.update(result)

###################################
#Make dictionary of daily matches #
###################################
def scrape_paper(title_id):
    title_url = "_".join((title_stub_url, title_id))
    wait=0
    while True:
        try:
            title_get = session.get(title_url,  cookies=session.cookies, allow_redirects=True).text
            break
        except:
    line['archive'] = "americas_historical_newspapers"
    line['publication_title'] = title.input.text
    line['publication_id'] = title.input['value']
    city = title.find('td', class_="ci").text
    state = title.find('td', class_="st").text
    line['location'] = ", ".join((city,state))
    line['lastUpdated'] = search_datetime
    papers_data.append(line)


#Scrape publication dates in parallel
pub_ids = [paper['publication_id'] for paper in papers_data]

print "Scraping papers..."
pool = Pool(10)
result_iter = pool.imap(scrape_paper, pub_ids, [date_list_str]*len(pub_ids))
title_sets = {}
for result in result_iter:
    title_sets.update(result)

#Create file#
filename = "americas_historical_newspapers-allpubs.csv"
filepath = directory
fields = ['archive', 'publication_title', 'publication_id', 'location', 'lastUpdated'] + date_list_str

#Create output
print "Creating data rows..."
out = []
for paper in papers_data:
    title_id = paper['publication_id']
    date_match = {k : int(k in title_sets[title_id]) for k in date_list_str}
Example #21
  def train(self,simulation, epochs=100, num_throws = 1000, training_file = None, learning_rate=1e-4):
    '''
    Train the pitcher's aim model for a given simulation.
    '''
    # generate training data.
    # simulation inputs:
    #  - pitch type
    #  - effort
    #  - verticle deflection
    #  - horizontal deflection
    # simulation outputs:
    #  - verticle location
    #  - horizontal location

    if self.aim_model is None:
      raise Exception("Error: Pitcher's aim model has not been initialized. Cannot train.")

    N = num_throws
    if training_file is None:
      training_file = f'pitcher-training-data-{num_throws}-{self.id()}.pl'
    if isinstance(training_file,str):
      training_file = pathlib.Path(training_file).resolve()


    if training_file.is_file():
      print(f"Traning data file found ({str(training_file)}). Loading training data from file.")
      data = torch.load(str(training_file))
      sim_inputs = data['i']
      sim_outputs = data['o']
    else:
      print(f"No training data file found ({str(training_file)}). Created training dataset now")

      pint.set_application_registry(ureg)
      pool = Pool()


      sim_inputs  = dict()
      sim_inputs['type'] = torch.empty([N],dtype=int)
      sim_inputs['effort'] = torch.empty([N],dtype=ScalarType)
      sim_inputs['verticle_deflection'] = torch.empty([N],dtype=ScalarType)
      sim_inputs['horizontal_deflection'] = torch.empty([N],dtype=ScalarType)

      sim_outputs = dict()
      sim_outputs['verticle_location'] = torch.empty([N],dtype=ScalarType)
      sim_outputs['horizontal_location'] = torch.empty([N],dtype=ScalarType)

      configs = list()
      print("Generating training configurations.")
      for i in tqdm.tqdm(range(N)):
        type = numpy.random.choice(list(self.characteristics['pitches'].keys()))
        effort = numpy.random.uniform(75,105)
        verticle_deflection = numpy.random.normal(loc=0,scale=5)
        horizontal_deflection = numpy.random.normal(loc=0,scale=5)
        sim_inputs['type'][i] = int(type)
        sim_inputs['effort'][i] = float(effort)
        sim_inputs['verticle_deflection'][i] = float(verticle_deflection)
        sim_inputs['horizontal_deflection'][i] = float(horizontal_deflection)

        configs.append(self.configure_throw_from_deflection( type, Q_(effort,'percent'), Q_(verticle_deflection,'degree'), Q_(horizontal_deflection,'degree')))
     
      def terminate(record):
        if record[-1][2] < 0:
          return True
        if record[-1][0] > 2:
          return True
        return False

      def run_config(config):
        return simulation.run( config, terminate, record_all=False )

      # estimate the runtime
      # print("Estimating runtime to generate training data...")
      # i = 0
      # def run():
        # nonlocal i
        # run_config(configs[i%N])
        # i = i+1
      # runtime = timeit.timeit(run,number=10)/10
      # print(f"  will take approximately {runtime*N/pool.nodes} s to run {N} simulations @ {runtime} s / run on {pool.nodes} CPUs.")

      runs = list(tqdm.tqdm(pool.imap( run_config, configs), total=N))

      for i in range(N):
        sim_outputs['horizontal_location'][i] = runs[i][0][1]
        sim_outputs['verticle_location'][i] = runs[i][0][3]

      torch.save( {'i':sim_inputs,'o':sim_outputs}, str(training_file) )
      

    model_inputs  = torch.empty( [N,self.aim_model.in_features],dtype=ScalarType )
    model_outputs = torch.empty( [N,self.aim_model.out_features],dtype=ScalarType )

    for i in range(N):
      model_inputs[i,:] = self.aim_model.make_feature_vector( pitch_type=sim_inputs['type'][i],
                                                              effort=sim_inputs['effort'][i],
                                                              verticle_location=sim_outputs['verticle_location'][i],
                                                              horizontal_location=sim_outputs['horizontal_location'][i] )
    

      model_outputs[i,:] = self.aim_model.make_output_vector( verticle_deflection=sim_inputs['verticle_deflection'][i],
                                               horizontal_deflection=sim_inputs['horizontal_deflection'][i] )

    
    # optimizer = torch.optim.Adam( self.aim_model.parameters(), lr=1e-2 )
    optimizer = torch.optim.SGD( self.aim_model.parameters(), lr=learning_rate )
    loss_func = torch.nn.MSELoss()


    losses = list()
    print(f"Traning model:")
    for i in tqdm.tqdm(range(epochs)):
      optimizer.zero_grad()
      pred = self.aim_model(model_inputs)
      loss = loss_func(pred,model_outputs)
      losses.append(float(loss))
      loss.backward()
      optimizer.step()


    return losses
Example #22
#!/usr/bin/env python
#
# Author: Mike McKerns (mmckerns @caltech and @uqfoundation)
# Copyright (c) 1997-2014 California Institute of Technology.
# License: 3-clause BSD.  The full license text is available at:
#  - http://trac.mystic.cacr.caltech.edu/project/pathos/browser/pathos/LICENSE

# instantiate and configure the worker pool
from pathos.multiprocessing import ProcessingPool
pool = ProcessingPool(nodes=4)

_result = map(pow, [1,2,3,4], [5,6,7,8]) 

# do a blocking map on the chosen function
result = pool.map(pow, [1,2,3,4], [5,6,7,8])
assert result == _result

# do a non-blocking map, then extract the result from the iterator
result_iter = pool.imap(pow, [1,2,3,4], [5,6,7,8])
result = list(result_iter)
assert result == _result

# do an asynchronous map, then get the results
result_queue = pool.amap(pow, [1,2,3,4], [5,6,7,8])
result = result_queue.get()
assert result == _result
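The scrapers below pass two iterables to imap; as with the two-iterable pow calls above, the iterables are paired element-wise. A small sketch reusing the pool configured above:

def label(tag, value):
    return "{}={}".format(tag, value)

print(list(pool.imap(label, ["a", "b", "c"], [1, 2, 3])))
# ['a=1', 'b=2', 'c=3']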

def newspapers_com_scraper(search_terms, start_date, end_date, filepath):
    
    #Set starting values
    #Set URLs
    signin_url = "https://www.newspapers.com/signon.php"
    search_url = "http://www.newspapers.com/search/aj_getresults"
    search_content_url = "http://www.newspapers.com/search/aj_getsearchrecord"

    

    #Scraper Functions
    #Define date generator
    def perdelta(start, end, delta):
        curr = start
        while curr < end:
            yield curr
            curr += delta

    #Make search query
    def make_search_query(search_terms, search_date, count):
        query_terms = {"terms":[{"type":"keyword","values":{"value":search_terms}},{"type":"date","values":{"name":"year_month_day","value":str(search_date),"showMissing":"true"}}, {"type":"field", "values":{"name":"place","value":"United States of America"}}]}
        query_form = {"query_terms":dumps(query_terms), "start":0, "count":count, "engine":"solr", "sort":"score desc"}
        return query_form

    #Create record dictionary for content search
    def make_record_dict(records):
        out = []
        for record in records:
            temp = {}
            temp['records'] = [record['records'][0]]
            temp['rollup'] = record['id']
            out.append(temp)
        return out

    def get_content(records):
        groups = 1
        while True:
            try:
                records_list = []
                for group in range(groups):
                    records_list.append(records[group::groups])
                articles = []
                for records_group in records_list:
                    record_dict = make_record_dict(records_group)
                    content_query_form = {'records':dumps(record_dict), 'highlight_terms':search_terms.replace(" ", "|"), 'nonKeywordView' : 'false'}
                    
                    wait = 0
                    while True:
                        try:
                            content = session.post(search_content_url, data = content_query_form, cookies=session.cookies, allow_redirects=True, headers={'referer' : 'http://www.newspapers.com/search/'}, timeout=(1,60)).text
                            break
                        except:
                            print "... trying again ..."
                            sleep(1.5**wait)
                            wait += 1
                    articles = articles + json.loads(content)['records']
                break
            except ValueError:
                groups += 1
        return articles

    #Get article attributes
    def get_from_object(obj, *keys):
        try:
            value = obj
            for k in keys:
                if isinstance(value, dict):
                    value = value.get(k)
                elif isinstance(value, list) and len(value)>1:
                    value = (item for item in value if item['name'] == k).next()['value']
                elif isinstance(value, list) and len(value)==1:
                    value = value[k]
            return value
        except:
            return ''

    #Extract article data
    def get_article_data(record, search_date):
        line = {}
        line['archive'] = 'newspapers_com'
        line['publication_id'] = get_from_object(record, 'rec', 'cover', 'publicationId')
        line['publication_title'] = get_from_object(record, 'rec', 'pubMetadata', 'publication-title')
        line['search_date'] = search_date
        line['page'] = get_from_object(record, 'rec', 'cover', 'title')
        line['href'] = "http://www.newspapers.com/image/" + str(record['rec']['cover']['id']) + "/?terms=" + record['terms']
        line['search_terms'] = search_terms
        return line

    #Scrape function
    def scrape(search_terms, day):
        sleep(1)
        print(day)

        #Create search query
        query_form = make_search_query(search_terms, day, 1000)

        #POST search query
        wait = 0
        while True:
            try:
                matches = session.post(search_url, data = query_form, cookies=session.cookies, allow_redirects=True, headers={'referer' : 'http://www.newspapers.com/search/'}, timeout=(1,60)).text
                break
            except Exception:
                print("... trying again ...")
                sleep(1.5**wait)
                wait += 1

        #Create search content query
        results = json.loads(matches)
        if results['recCount'] > 0:
            #records = make_record_dict(results['records'])
            #print "Made "
            
            #Get articles
            articles = get_content(results['records'])

            lines = []
            for article in articles:
                lines.append(get_article_data(article, day))

            return lines
        else:
            return None

    #Complete Scraper
    date_list = [str(date) for date in perdelta(start_date, end_date, timedelta(days=1))]
    #Start session
    session = requests.session()

    #Log in
    signin = session.get(signin_url)
    doc = lxml.html.fromstring(signin.text)
    signin_form = doc.forms[0]
    signin_form.fields['username'] = "******"
    signin_form.fields['password'] = "******"
    session.post(signin_url, data=signin_form.form_values(), allow_redirects=True)

    #Create CSV
    #Create file name
    timeperiod = str(start_date) + "to" + str(end_date - timedelta(days=1))
    filename = "newspapers_com-" + timeperiod + ".csv"
    fields = ["archive", "publication_title", "publication_id", "search_date", "page", "href", "search_terms"]
    
    pool = Pool(10)
    results_iter = pool.imap(scrape, [search_terms]*len(date_list), date_list)

    with open("/".join((filepath,filename)), "w") as w:
        writer = csv.DictWriter(w, fieldnames=fields)
        writer.writeheader()
        #Loop over days
        for results in results_iter:
            if results is not None:
                writer.writerows(results)
    state = re.search("^[^(--)]+(?=--)", str(location_raw)).group(0)
    line['location'] = ", ".join((city, state))
    line['lastUpdated'] = search_datetime
    #Get paper publication dates
    paper_date_set = set(x['date_issued'] for x in paper_data['issues'])
    date_match = {k : int(k in paper_date_set) for k in date_list}
    line.update(date_match)
    return line
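
    # Illustration of the date_match construction above with toy values (not
    # real issue dates):
    #   paper_date_set = {"1850-01-02"}
    #   date_list = ["1850-01-01", "1850-01-02", "1850-01-03"]
    #   {k: int(k in paper_date_set) for k in date_list}
    #     -> {"1850-01-01": 0, "1850-01-02": 1, "1850-01-03": 0}
    # i.e. each calendar day becomes a 0/1 column marking whether the paper
    # published an issue that day.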




#Scrape publication data
print "Getting publication data..."
pool = Pool(10)
result_iter = pool.imap(scrape_paper, paper_stubs, [date_list]*len(paper_stubs))
lines = []
for result in result_iter:
    lines.append(result)

#Prepare for write
filename = "chronicling_america-allpubs.csv"
filepath = directory
fields = ['archive', 'publication_title', 'publication_id', 'location', 'lastUpdated'] + date_list

print "Creating data rows..."
out = []
for line in lines:
    line['publication_title'] = line['publication_title'].encode('utf8')
    line['location'] = line['location'].encode('utf8')
    out.append([line[k] for k in fields])
Example #25
0
# instantiate and configure the worker pool
from pathos.multiprocessing import ProcessingPool
pool = ProcessingPool(nodes=4)

_result = list(map(pow, [1,2,3,4], [5,6,7,8]))

# do a blocking map on the chosen function
result = pool.map(pow, [1,2,3,4], [5,6,7,8])
assert result == _result

# do a non-blocking map, then extract the result from the iterator
result_iter = pool.imap(pow, [1,2,3,4], [5,6,7,8])
result = list(result_iter)
assert result == _result

# do an asynchronous map, then get the results
result_queue = pool.amap(pow, [1,2,3,4], [5,6,7,8])
result = result_queue.get()
assert result == _result
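
# pathos' ProcessingPool also offers an unordered non-blocking map (uimap);
# results arrive as workers finish, so only the collection of values, not
# their order, is guaranteed to match the reference result
result = sorted(pool.uimap(pow, [1,2,3,4], [5,6,7,8]))
assert result == sorted(_result)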

Example #26
0
def main():
    """Run preprocessing process."""
    parser = argparse.ArgumentParser(
        description=
        "Preprocess audio and then extract features (See detail in tensorflow_tts/bin/preprocess.py)."
    )
    parser.add_argument("--rootdir",
                        default=None,
                        type=str,
                        required=True,
                        help="root path.")
    parser.add_argument("--outdir",
                        default=None,
                        type=str,
                        required=True,
                        help="output dir.")
    parser.add_argument("--config",
                        type=str,
                        required=True,
                        help="yaml format configuration file.")
    parser.add_argument("--n_cpus",
                        type=int,
                        default=4,
                        required=False,
                        help="yaml format configuration file.")
    parser.add_argument("--test_size",
                        type=float,
                        default=0.05,
                        required=False,
                        help="yaml format configuration file.")
    parser.add_argument(
        "--verbose",
        type=int,
        default=1,
        help="logging level. higher is more logging. (default=1)")
    args = parser.parse_args()

    # set logger
    if args.verbose > 1:
        logging.basicConfig(
            level=logging.DEBUG,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
    elif args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
    else:
        logging.basicConfig(
            level=logging.WARN,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
        logging.warning('Skip DEBUG/INFO messages')

    # load config
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    config.update(vars(args))

    processor = LJSpeechProcessor(root_path=args.rootdir,
                                  cleaner_names="english_cleaners")

    # check directory existence
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir, exist_ok=True)
        os.makedirs(os.path.join(args.outdir, 'valid'), exist_ok=True)
        os.makedirs(os.path.join(args.outdir, 'valid', 'raw-feats'),
                    exist_ok=True)
        os.makedirs(os.path.join(args.outdir, 'valid', 'wavs'), exist_ok=True)
        os.makedirs(os.path.join(args.outdir, 'valid', 'ids'), exist_ok=True)
        os.makedirs(os.path.join(args.outdir, 'valid', 'raw-f0'),
                    exist_ok=True)
        os.makedirs(os.path.join(args.outdir, 'valid', 'raw-energies'),
                    exist_ok=True)
        os.makedirs(os.path.join(args.outdir, 'train'), exist_ok=True)
        os.makedirs(os.path.join(args.outdir, 'train', 'raw-feats'),
                    exist_ok=True)
        os.makedirs(os.path.join(args.outdir, 'train', 'wavs'), exist_ok=True)
        os.makedirs(os.path.join(args.outdir, 'train', 'ids'), exist_ok=True)
        os.makedirs(os.path.join(args.outdir, 'train', 'raw-f0'),
                    exist_ok=True)
        os.makedirs(os.path.join(args.outdir, 'train', 'raw-energies'),
                    exist_ok=True)

    # train test split
    idx_train, idx_valid = train_test_split(range(len(processor.items)),
                                            shuffle=True,
                                            test_size=args.test_size,
                                            random_state=42)

    # train/valid utt_ids
    train_utt_ids = []
    valid_utt_ids = []

    for idx in range(len(processor.items)):
        utt_ids = processor.get_one_sample(idx)["utt_id"]
        if idx in idx_train:
            train_utt_ids.append(utt_ids)
        elif idx in idx_valid:
            valid_utt_ids.append(utt_ids)

    # save train and valid utt_ids to track later.
    np.save(os.path.join(args.outdir, "train_utt_ids.npy"), train_utt_ids)
    np.save(os.path.join(args.outdir, "valid_utt_ids.npy"), valid_utt_ids)

    # process each data
    def save_to_file(idx):
        sample = processor.get_one_sample(idx)

        # get info from sample.
        audio = sample["audio"]
        text_ids = sample["text_ids"]
        utt_id = sample["utt_id"]
        rate = sample["rate"]

        # check
        assert len(audio.shape) == 1, \
            f"{utt_id} seems to be multi-channel signal."
        assert np.abs(audio).max() <= 1.0, \
            f"{utt_id} seems to be different from 16 bit PCM."
        assert rate == config["sampling_rate"], \
            f"{utt_id} seems to have a different sampling rate."

        # trim silence
        if config["trim_silence"]:
            audio, _ = librosa.effects.trim(
                audio,
                top_db=config["trim_threshold_in_db"],
                frame_length=config["trim_frame_size"],
                hop_length=config["trim_hop_size"])

        if "sampling_rate_for_feats" not in config:
            x = audio
            sampling_rate = config["sampling_rate"]
            hop_size = config["hop_size"]
        else:
            x = librosa.resample(audio, rate,
                                 config["sampling_rate_for_feats"])
            sampling_rate = config["sampling_rate_for_feats"]
            assert config["hop_size"] * config["sampling_rate_for_feats"] % rate == 0, \
                "hop_size must be int value. please check sampling_rate_for_feats is correct."
            hop_size = config["hop_size"] * config[
                "sampling_rate_for_feats"] // rate

        # extract feature
        mel, x_stft = logmelfilterbank(x,
                                       sampling_rate=sampling_rate,
                                       hop_size=hop_size,
                                       fft_size=config["fft_size"],
                                       win_length=config["win_length"],
                                       window=config["window"],
                                       num_mels=config["num_mels"],
                                       fmin=config["fmin"],
                                       fmax=config["fmax"])

        # make sure the audio length and feature length match
        audio = np.pad(audio, (0, config["fft_size"]), mode='edge')
        audio = audio[:len(mel) * config["hop_size"]]

        # extract raw pitch
        f0, _ = pw.dio(x.astype(np.double),
                       fs=config["sampling_rate"],
                       f0_ceil=config["fmax"],
                       frame_period=1000 * config["hop_size"] /
                       config["sampling_rate"])

        if len(f0) >= len(mel):
            f0 = f0[:len(mel)]
        else:
            f0 = np.pad(f0, ((0, len(mel) - len(f0))))

        # extract energy
        S = librosa.magphase(x_stft)[0]
        energy = np.sqrt(np.sum(S**2, axis=0))
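        # energy is the per-frame L2 norm of the linear magnitude spectrogram,
        # giving one value per analysis frame (matched against mel/f0 below)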

        assert len(mel) * config["hop_size"] == len(audio)
        assert len(mel) == len(f0) == len(energy)

        # apply global gain
        if config["global_gain_scale"] > 0.0:
            audio *= config["global_gain_scale"]
        if np.abs(audio).max() >= 1.0:
            logging.warn(f"{utt_id} causes clipping. "
                         f"it is better to re-consider global gain scale.")

        # save
        if config["format"] == "npy":
            if idx in idx_train:
                subdir = 'train'
            elif idx in idx_valid:
                subdir = 'valid'

            np.save(os.path.join(args.outdir, subdir, "wavs",
                                 f"{utt_id}-wave.npy"),
                    audio.astype(np.float32),
                    allow_pickle=False)
            np.save(os.path.join(args.outdir, subdir, "raw-feats",
                                 f"{utt_id}-raw-feats.npy"),
                    mel.astype(np.float32),
                    allow_pickle=False)
            np.save(os.path.join(args.outdir, subdir, "ids",
                                 f"{utt_id}-ids.npy"),
                    text_ids.astype(np.int32),
                    allow_pickle=False)
            np.save(os.path.join(args.outdir, subdir, "raw-f0",
                                 f"{utt_id}-raw-f0.npy"),
                    f0.astype(np.float32),
                    allow_pickle=False)
            np.save(os.path.join(args.outdir, subdir, "raw-energies",
                                 f"{utt_id}-raw-energy.npy"),
                    energy.astype(np.float32),
                    allow_pickle=False)
        else:
            raise ValueError("support only npy format.")

    # apply multi-processing Pool
    p = Pool(nodes=args.n_cpus)
    work = tqdm(range(len(processor.items)), desc="[Preprocessing]")
    list(p.imap(save_to_file, work))
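
    # A possible variant (not from the original script): wrap the result
    # iterator in tqdm instead of the input range, so the bar advances as
    # items complete rather than as they are dispatched to workers:
    #
    #     for _ in tqdm(p.imap(save_to_file, range(len(processor.items))),
    #                   total=len(processor.items),
    #                   desc="[Preprocessing]"):
    #         pass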
	try:
		get_states = requests.get(nation_url, timeout=(1,60)).text
		break
	except:
		sleep(1.5**wait)
		wait += 1

parsed = BeautifulSoup(get_states, 'html.parser')
state_urls = [a['href'] for a in parsed.find('div', class_='newLocUSListArea').find_all('a')]

################
#Get town links#
################
print "Getting town URLs..."
pool = Pool(10)
result_iter = pool.imap(get_town_urls, state_urls)

town_urls = []
for result in result_iter:
	town_urls += result

#Clean up town URLs
town_urls = [re.sub(r"st\.-", "st-", url) for url in town_urls]

#################
#Get paper links#
#################
print "Getting paper URLs..."
result_iter = pool.imap(get_paper_urls, town_urls)

paper_urls = []
Example #28
0
    def RunCutplan(self, timer=True):
        # initialisations
        id = []
        t = time()
        start_id = 0
        if id == []:
            start_id = sum(self.completed)
            id = list(range(start_id, self.Cutplans.shape[0]))
        numproc = cpu_count() - 2
        p = Pool(processes=numproc)
        # cpSched = self.Cutplans.iloc[id]

        # NumLogs = min(
        #     [self.Cutplans.LogCount[id]*1.5, [1000]*len(id)], axis=0)
        iterLog = []
        iterC = []
        lens = []
        descs = []
        rs = []
        for cID in id:
            c = self.Cutplans.iloc[cID]
            # find desc to be used to open the correct log data file
            desc = c.Description[2:4]+"-"+str(int(c.Description[5:7])-1)
            descs.append(desc)
            # get data from log data file
            FullLD = read_csv(self.logPath+desc+'.csv')
            # total = NumLogs[id.index(cID)]
            # lMax = int(min([total, FullLD.shape[0]]))
            # lID = randint(0, FullLD.shape[0]-lMax)
            # LD =
            # FullLD.iloc[lID:lID+lMax].reset_index().drop('index', axis=1)
            lens.append(FullLD.shape[0])

            # Set up lists for multiprocessing
            for i in range(FullLD.shape[0]):
                log = FullLD.iloc[i]
                iterLog.append(log)
                iterC.append(c)
                descs.append(desc)
                rs.append(Recovery(c))

# =============================================================================
#             completed = []
#             for lID in range(len(iterLog)):
#                 log = iterLog[lID]
#                 coords = GetLogCoords(log, c)
#                 completed.append(coords)
#                 Timer(id, cID, lID, time()-t)
# =============================================================================
        # if id.index(cID) > 0:
        #     p.restart()
        data = []
        data = p.imap(GetLogCoords, iterLog, iterC)
        completed = []
        i = 0
        j = 0
        LogRecoveries = self.CreateRecoveriesDF(lens[i])
        while j < len(iterLog):
            # try:
            j += 1
            res = next(data)
            completed.append(res)
            if self.abort:
                self.abort = False
                return
            if timer:
                cur = j - sum(lens[0:i])
                rs[j-1].RunRecovery(res)
                self.LogTransfer(iterLog[j-1], LogRecoveries, cur-1)
                self.RecoveryTransfer(
                    rs[j-1], LogRecoveries, cur-1, descs[j-1])
                Timer((i+1, len(id)), j, len(iterLog), time()-t)
                self.l_progress.emit(cur/lens[i])
                if cur == lens[i]:
                    self.completed[cID] = True
                    self.cp_progress.emit(i)
                    self.Recoveries[i] = DataFrame(LogRecoveries)
                    i += 1
                    if i < len(lens):
                        LogRecoveries = self.CreateRecoveriesDF(lens[i])
            # except BaseException:
            #     print("fail")
            #     break
        # self.finishedSim.emit()
        # start = 0
        # for l in lens:
        #     LogRecoveries = self.CreateRecoveriesDF(l)
        #     for i in range(l):
        #         self.LogTransfer(iterLog[i], LogRecoveries, i)
        #         self.RecoveryTransfer(
        #             rs[start+i], LogRecoveries, lID, descs[start+i])
        #     self.Recoveries[cID] = DataFrame(LogRecoveries)
        #     start += l
        self.finished.emit()
Example #29
0
        def main():
            parser = argparse.ArgumentParser()
            parser.add_argument(
                'source_path',
                help="Path to the video or audio file to subtitle",
                nargs='?')
            parser.add_argument(
                '-C',
                '--concurrency',
                help="Number of concurrent API requests to make",
                type=int,
                default=10)
            parser.add_argument(
                '-o',
                '--output',
                help=
                "Output path for subtitles (by default, subtitles are saved in \ the same directory and name as the source path)"
            )
            parser.add_argument('-F',
                                '--format',
                                help="Destination subtitle format",
                                default="srt")
            parser.add_argument('-S',
                                '--src-language',
                                help="Language spoken in source file",
                                default="en")
            parser.add_argument('-D',
                                '--dst-language',
                                help="Desired language for the subtitles",
                                default="en")
            parser.add_argument(
                '-K',
                '--api-key',
                help=
                "The Google Translate API key to be used. (Required for subtitle translation)"
            )
            parser.add_argument('--list-formats',
                                help="List all available subtitle formats",
                                action='store_true')
            parser.add_argument(
                '--list-languages',
                help="List all available source/destination languages",
                action='store_true')

            args = parser.parse_args()
            print(args)

            if (os.name == "posix"):
                args.source_path = str(self.filename)
            else:
                args.source_path = (str(self.filename)).replace("/", "\\")
                pas = (args.source_path).replace("/", "\\")
                args.source_path = pas
                print " Printing pas >>>", pas
            print args

            path = args.source_path[:-3]
            srt_path = path + "srt"

            if args.list_formats:
                print("List of formats:")
                for subtitle_format in FORMATTERS.keys():
                    print("{format}".format(format=subtitle_format))
                return 0

            if args.list_languages:
                print("List of all languages:")
                for code, language in sorted(LANGUAGE_CODES.items()):
                    print("{code}\t{language}".format(code=code,
                                                      language=languages))
                return 0

            if args.format not in FORMATTERS.keys():
                print(
                    "Subtitle format not supported. Run with --list-formats to see all supported formats."
                )
                return 1

            if args.src_language not in LANGUAGE_CODES.keys():
                print(
                    "Source language not supported. Run with --list-languages to see all supported languages."
                )
                return 1

            if args.dst_language not in LANGUAGE_CODES.keys():
                print(
                    "Destination language not supported. Run with --list-languages to see all supported languages."
                )
                return 1

            if not args.source_path:
                print("Error: You need to specify a source path.")
                return 1

            audio_filename, audio_rate = extract_audio(args.source_path)

            regions = find_speech_regions(audio_filename)
            pool = ProcessingPool(args.concurrency)
            converter = FLACConverter(source_path=audio_filename)
            recognizer = SpeechRecognizer(language=args.src_language,
                                          rate=audio_rate,
                                          api_key=GOOGLE_SPEECH_API_KEY)

            transcripts = []
            if regions:
                try:
                    widgets = [
                        "Converting speech regions to FLAC files: ",
                        Percentage(), ' ',
                        Bar(), ' ',
                        ETA()
                    ]
                    pbar = ProgressBar(widgets=widgets,
                                       maxval=len(regions)).start()
                    extracted_regions = []
                    for i, extracted_region in enumerate(
                            pool.imap(converter, regions)):
                        extracted_regions.append(extracted_region)
                        pbar.update(i)
                        self.progress1.setValue(i)
                    pbar.finish()

                    widgets = [
                        "Performing speech recognition: ",
                        Percentage(), ' ',
                        Bar(), ' ',
                        ETA()
                    ]
                    pbar = ProgressBar(widgets=widgets,
                                       maxval=len(regions)).start()

                    for i, transcript in enumerate(
                            pool.imap(recognizer, extracted_regions)):
                        transcripts.append(transcript)
                        pbar.update(i)
                        self.progress2.setValue(i)
                    pbar.finish()
                    QMessageBox.about(self, "Subtitles created",
                                      "Created at " + srt_path)
                    if not is_same_language(args.src_language,
                                            args.dst_language):
                        if args.api_key:
                            google_translate_api_key = args.api_key
                            translator = Translator(args.dst_language,
                                                    google_translate_api_key,
                                                    dst=args.dst_language,
                                                    src=args.src_language)
                            prompt = "Translating from {0} to {1}: ".format(
                                args.src_language, args.dst_language)
                            widgets = [
                                prompt,
                                Percentage(), ' ',
                                Bar(), ' ',
                                ETA()
                            ]
                            pbar = ProgressBar(widgets=widgets,
                                               maxval=len(regions)).start()
                            translated_transcripts = []
                            for i, transcript in enumerate(
                                    pool.imap(translator, transcripts)):
                                translated_transcripts.append(transcript)
                                pbar.update(i)
                                self.progress2.setValue(i)
                            pbar.finish()
                            transcripts = translated_transcripts
                        else:
                            print "Error: Subtitle translation requires specified Google Translate API key. \See --help for further information."
                            return 1

                except KeyboardInterrupt:
                    pbar.finish()
                    pool.terminate()
                    pool.join()
                    print "Cancelling transcription"
                    return 1

            timed_subtitles = [(r, t) for r, t in zip(regions, transcripts)
                               if t]
            formatter = FORMATTERS.get(args.format)
            formatted_subtitles = formatter(timed_subtitles)

            dest = args.output

            if not dest:
                base, ext = os.path.splitext(args.source_path)
                dest = "{base}.{format}".format(base=base, format=args.format)

            with open(dest, 'wb') as f:
                f.write(formatted_subtitles.encode("utf-8"))

            print "Subtitles file created at {}".format(dest)

            os.remove(audio_filename)

            return 0
Example #30
0
class analyze(setup.setup):
    def __init__(self, args, logging_level=logging.INFO):

        super(analyze, self).__init__(args, logging_level)

    # set up processing pool and run all analyses specified in args
    def run(self):

        if self.args.jumpdists:
            n_bins = 100.
            bin_width = 1 / n_bins
            bins = np.arange(0, 1 + bin_width, 1 / n_bins)

            if self.args.file:
                user, vals = self.artist_jump_distributions(self.args.file,
                                                            bins=bins,
                                                            self_jumps=False)
                with open(self.args.resultdir + user, 'w') as fout:
                    fout.write(','.join(vals.astype(str)) + '\n')

            else:
                raise NotImplementedError('not implemented!')
                self.pool = Pool(self.args.n)
                self.rootLogger.info("Pool started")

                self.rootLogger.info("Starting jump distance analysis")

                func_partial = partial(self.artist_jump_distributions,
                                       bins=bins,
                                       self_jumps=False)
                with open(self.args.resultdir + 'jumpdists', 'w') as fout:
                    for user, vals in self.pool.imap(func_partial,
                                                     self.listen_files):
                        fout.write(user + '\t' + ','.join(vals.astype(str)) +
                                   '\n')

                self.pool.close()
                self.rootLogger.info("Pool closed")

        if self.args.blockdists:
            #self.rootLogger.info("Starting block distance analysis")
            self.mean_block_distances(self.args.file)

        if self.args.diversity_dists:
            bins = np.arange(0, 1.01, .01)
            self.diversity_distributions(self.args.file, bins=bins)

        if self.args.clustering:
            self.clustering(self.args.file)

        if self.args.values:
            self.patch_values(self.args.file)

        if self.args.exp:
            self.explore_exploit(self.args.file)

        if self.args.patch_len_dists:
            self.patch_len_dists(self.args.file)

    # calculate distribution (using histogram with specified bins)
    # of sequential artist-to-artist distances
    def artist_jump_distributions(self, fi, bins, self_jumps=False):
        user = fi.split('/')[-1][:-4]
        df = pd.read_pickle(fi)
        if self_jumps:
            vals = np.histogram(df['dist'].dropna(), bins=bins)[0]
        else:
            vals = np.histogram(df['dist'][df['dist'] > 0], bins=bins)[0]
        self.rootLogger.info(
            'artist jump distances done for user {} ({})'.format(user, fi))
        return user, vals

    # calculate distribution (using histogram with specified bins)
    # of patch diversity for each user

    # awk 'FNR==1' * > diversity_dists_zeros
    # awk 'FNR==2' * > diversity_dists_nozeros
    def diversity_distributions(self, fi, bins):
        if 'patches' not in fi:
            raise ValueError('WRONG DATATYPE')
        user = fi.split('/')[-1].split('_')[0]
        df = pd.read_pickle(fi).dropna(subset=['diversity'])
        zeros = np.histogram(df[df['n'] >= 5]['diversity'], bins=bins)[0]
        nozeros = np.histogram(df[(df['n'] >= 5)
                                  & (df['diversity'] > 0)]['diversity'],
                               bins=bins)[0]

        zeros = zeros / float(zeros.sum())
        nozeros = nozeros / float(nozeros.sum())

        with open(self.args.resultdir + user, 'w') as fout:
            fout.write(user + '\t' + 'zeros' + '\t' +
                       ','.join(zeros.astype(str)) + '\n')
            fout.write(user + '\t' + 'nozeros' + '\t' +
                       ','.join(nozeros.astype(str)) + '\n')
        self.rootLogger.info(
            'diversity distributions done for user {} ({})'.format(user, fi))

    def mean_block_distances(self, fi, n=100):
        def cos_nan(arr1, arr2):
            if np.any(np.isnan(arr1)) or np.any(np.isnan(arr2)):
                return np.nan
            else:
                return cosine(arr1, arr2)

        user = fi.split('/')[-1].split('_')[0]
        df = pd.read_pickle(fi)
        blocks = df[df['n'] >= 5].dropna()

        result = []
        for i in range(len(blocks) - n):
            first = blocks['centroid'].iloc[i]
            result.append(
                np.array(blocks['centroid'][i + 1:i + n + 1].apply(
                    lambda val: cos_nan(val, first))))
        result = np.nanmean(np.vstack(result), 0)

        with open(self.args.resultdir + user, 'w') as fout:
            fout.write(
                '\t'.join([user, 'patch', ','.join(result.astype(str))]) +
                '\n')

        self.rootLogger.info(
            'Block distances for user {} processed successfully ({})'.format(
                user, fi))

        # now shuffled
        # idx = np.array(blocks.index)
        # np.random.shuffle(idx)
        # blocks = blocks.reindex(idx)

        # result_random = []
        # for i in xrange(len(blocks)-n):
        #     first = blocks['centroid'].iloc[i]
        #     result_random.append(np.array(blocks['centroid'][i+1:i+n+1].apply(lambda val: cos_nan(val,first))))
        # result_random = np.nanmean(np.vstack(result_random),0)

        # with open(self.args.resultdir+user,'w') as fout:
        #     fout.write('\t'.join([user,'patch',','.join(result.astype(str))])+'\n')
        #     fout.write('\t'.join([user,'patch_random',','.join(result_random.astype(str))])+'\n')
        # self.rootLogger.info('Block distances for user {} processed successfully ({})'.format(user,fi))

    def clustering(self, fi):
        df = pd.read_pickle(fi)
        user = fi.split('/')[-1].split('_')[0]

        mask = (df['centroid'].apply(lambda arr: ~np.any(np.isnan(arr))).values
                ) & (df['n'] >= 5) & (df['diversity'] <= 0.2)
        clust_data = df[mask].reset_index()
        arr = np.vstack(clust_data['centroid'])
        Z = linkage(arr, 'complete')
        clusters = fcluster(Z, t=0.2, criterion='distance')
        assignments = np.repeat(np.nan, len(df))
        assignments[np.where(mask)] = clusters
        df['patch_clust'] = assignments
        df.to_pickle('{}{}.pkl'.format(self.args.resultdir, user))
        self.rootLogger.info(
            'Patch clusters for user {} processed successfully ({})'.format(
                user, fi))

    def patch_len_dists(self, fi):
        df = pd.read_pickle(fi)
        user = fi.split('/')[-1][:-4]

        explore = df[np.isnan(df['patch_clust'])]
        result_explore = explore['n'].value_counts()

        df['explore'] = np.isnan(df['patch_clust']).astype(int)
        df['explore-idx'] = df['explore'].cumsum()

        result_exploit = df.groupby('explore-idx').apply(
            lambda df: df.dropna()['n'].sum()).value_counts()

        result_explore = result_explore.reindex(
            range(1, max(result_explore.index) + 1), fill_value=0.).values
        result_exploit = result_exploit.reindex(
            range(1, max(result_exploit.index) + 1), fill_value=0.).values

        result_explore = sparse.csr_matrix(result_explore)
        result_exploit = sparse.csr_matrix(result_exploit)

        with open(self.args.resultdir + user, 'w') as fout:
            fout.write(user + '\t' + 'explore' + '\t' + ':'.join([
                ','.join(a.astype(str)) for a in (result_explore.data,
                                                  result_explore.indices,
                                                  result_explore.indptr)
            ]) + '\n')
            fout.write(user + '\t' + 'exploit' + '\t' + ':'.join([
                ','.join(a.astype(str)) for a in (result_exploit.data,
                                                  result_exploit.indices,
                                                  result_exploit.indptr)
            ]) + '\n')
        self.rootLogger.info('User {} processed successfully ({})'.format(
            user, fi))

    def explore_exploit(self, fi):

        user = fi.split('/')[-1][:-4]

        df_patches_raw = pd.read_pickle(fi)

        # add time in next bout
        df_patches_raw['next_n'] = df_patches_raw['n'].shift(-1)

        # add patch values
        # listensPerPatch = df_patches_raw.groupby('patch_clust')['n'].sum()
        # overall_prop = listensPerPatch/float(df_patches_raw['n'].sum())
        # overall_prop.name = 'final_value'
        # df_patches_raw = df_patches_raw.join(overall_prop,on='patch_clust')
        """
        # time in next exploit patch as function of exploration time
        result = df_patches_raw[np.isnan(df_patches_raw['patch_clust'])].groupby('n')['next_n'].mean()

        fout.write(user+'\t'+'next-exploit-vs-explore'+'\t'+','.join(["{}:{}".format(a,b) for a,b in result.iteritems()])+'\n')
        """
        # total time exploiting as a function of time exploring
        df_patches_raw['explore'] = np.isnan(
            df_patches_raw['patch_clust']).astype(int)
        df_patches_raw['explore-idx'] = df_patches_raw['explore'].cumsum()

        # combine all exploit listens
        #grp_explore = df_patches_raw.groupby('explore-idx').apply(lambda df: pd.DataFrame({'n':[df['n'].iloc[0]],'n-exploit':[df['n'][1:].sum()]}))

        # only last exploit bout
        grp_explore = df_patches_raw.groupby('explore-idx').apply(
            lambda df: pd.DataFrame({
                'n': [df['n'].iloc[0]],
                'n-exploit': [df['n'].iloc[-1]]
            }))

        #result = grp_explore.groupby('n')['n-exploit'].mean()
        #fout.write(user+'\t'+'total-exploit-vs-explore'+'\t'+','.join(["{}:{}".format(a,b) for a,b in result.iteritems()])+'\n')
        """
        # exploration time as a function of exploitation time
        grp_exploit = grp_explore.copy()
        grp_exploit['n-explore'] = grp_exploit['n'].shift(-1)

        result = grp_exploit.groupby('n-exploit')['n-explore'].mean()
        fout.write(user+'\t'+'explore-vs-exploit'+'\t'+','.join(["{}:{}".format(a,b) for a,b in result.iteritems()])+'\n')
        """

        # prob exploit given explore time - already done

        # explore_only = df_patches_raw[np.isnan(df_patches_raw['patch_clust'])]
        # result = explore_only['n'][:-1].value_counts()
        # arr = result.reindex(xrange(1,max(result.index)+1),fill_value=0.).values
        # final_result = arr/(np.cumsum(arr[::-1])[::-1])
        # final_result = sparse.csr_matrix(final_result)

        # with open(self.args.resultdir+user+'_exploit','w') as fout:
        #     fout.write(user+'\t'+':'.join([','.join(a.astype(str)) for a in final_result.data,final_result.indices,final_result.indptr])+'\n')

        # prob explore given exploit time
        result = grp_explore['n-exploit'][
            grp_explore['n-exploit'] > 0].value_counts()
        arr = result.reindex(range(1, max(result.index) + 1),
                             fill_value=0.).values
        final_result = arr / np.cumsum(arr[::-1])[::-1]
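        # The division above is an empirical hazard rate: arr[L-1] counts
        # exploit bouts of length exactly L, the reversed cumulative sum counts
        # bouts of length at least L, so each ratio estimates the probability
        # of switching back to exploration after L exploit listens.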
        final_result = sparse.csr_matrix(final_result)

        with open(self.args.resultdir + user + '_explore', 'w') as fout:
            fout.write(user + '\t' + ':'.join([
                ','.join(a.astype(str)) for a in (final_result.data,
                                                  final_result.indices,
                                                  final_result.indptr)
            ]) + '\n')

        #fout.write(user+'\t'+'prob-explore-given-exploit'+'\t'+','.join(["{}:{}".format(a,b) for a,b in result.iteritems()])+'\n')
        """
        # patch value as a function of exploration time
        df_patches_raw['final_value_next'] = df_patches_raw['final_value'].shift(-1)
        result = df_patches_raw[np.isnan(df_patches_raw['patch_clust'])].groupby('n')['final_value_next'].mean()
        fout.write(user+'\t'+'exploit-value-vs-explore'+'\t'+','.join(["{}:{}".format(a,b) for a,b in result.iteritems()])+'\n')
        """

        self.rootLogger.info('User {} processed successfully ({})'.format(
            user, fi))