Example #1
 def process(self):
     
     try:
         urls = redis_one.hkeys(self.sitemap_prefix)
         ofh = open('test_urls.txt', 'w+')
         urls.sort()
         ofh.write(('\n'.join(urls)).encode('utf8', 'ignore'))
         ofh.close()
         logger.error('total urls len %s' % len(urls))
         dict_res = defaultdict(int)
         i = 0
         while i < len(urls):
             pool = Pool(processes=15)
             q = Queue()
             dict_subres = defaultdict(int)
             list_urls = [urls[i + j * 10000:i+(j+1)*10000] for j in range(15)]
             #list_dict_res = list(pool.map_async(parse_content, list_urls))
             for d in pool.imap(parse_content, list_urls):
                 for k, v in d.iteritems():
                     dict_res[k] += v
             pool.close()
             pool.join()
             logger.error('Parser %s %s' % (len(list_urls), len(dict_res)))
             i += 10000 * 15
         sorted_dict_res = sorted(dict_res.iteritems(), key = lambda s: s[1], reverse=True)
         ofh = open('./test_sitemap_keywords', 'w+')
         ofh.write('\n'.join(['%s\t%s' % (k,v) for (k,v) in sorted_dict_res if v>=3]).encode('utf8', 'ignore'))
         ofh.close()
     except:
         logger.error(traceback.format_exc())
def main():
    idir, ofile, dffile = _parse_cmdline()

    print u'Loading doc-freqs file {}...'.format(dffile)
    with open(dffile, 'rb') as f:
        df = pickle.load(f)    

    print u'Reading input directory: {}'.format(idir)
    jobs = _load_jobs(idir, df)

    # Do the work.
    pool = Pool(4)
    njobs = len(jobs)

    try:
        import sys
        per = 0.0
        with codecs.open(ofile, 'wb') as pf:
            pickle.dump(njobs, pf)
            results = pool.imap_unordered(worker, jobs)
            for i, result in enumerate(results, 1):
                pickle.dump(result, pf)
                per = 100 * (float(i) / njobs)
                sys.stdout.write(u'\rPercent Complete: {:2.3f}%'.format(per))
                sys.stdout.flush()
            sys.stdout.write(u'\rPercent Complete: 100%    \n')
            sys.stdout.flush()

    except KeyboardInterrupt:
        sys.stdout.write(u'\rPercent Complete: {:2.3f}%    \n'.format(per))
        sys.stdout.write(u'Shutting down.\n')
        sys.stdout.flush()
        sys.exit()

    print u'Complete!'
Example #3
class withPool:
    def __init__(self, procs):
        self.p = Pool(procs, init_func)
    def __enter__(self):
        return self
    def __exit__(self, exc_type, exc_val, exc_tb):
        self.p.close()
        self.p.join()
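A minimal usage sketch for the context manager above (hedged: it assumes the init_func referenced in __init__ is defined elsewhere, and the square worker here is only illustrative); the pool itself is reached through the p attribute returned by __enter__:

def _square(x):
    return x * x

if __name__ == '__main__':
    with withPool(4) as wp:
        print(wp.p.map(_square, range(10)))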
Example #4
def parse_genetrees(args):
    """parse a set of genetrees in serial or parallel fashion and run through PHYBASE"""
    is_nexus = False
    if args.input_file.endswith('.nex') or args.input_file.endswith('.nexus'):
        is_nexus = True
    chunks = get_genetree_chunks(args, is_nexus)
    print "Cleaning genetrees"
    if args.cores > 1:
        p = Pool(args.cores)
        trees = p.map(clean_genetree_worker, chunks)
        p.close()
        p.join()
    else:
        trees = map(clean_genetree_worker, chunks)
    # get taxa from first tree
    taxa = getTaxa(trees[0])
    # instantiate Phybase instance and analyse trees
    phybase = Phybase()
    star_tree, steac_tree = phybase.run(trees, args.outgroup, taxa)
    template = """#NEXUS\nbegin trees;\ntree 'STAR' = %s\ntree 'STEAC' = %s\nend;""" % (star_tree, steac_tree)
    print template
    star_steac_out = os.path.splitext(args.input_file)[0]
    star_steac_out += '.star_steac.trees'
    star_steac_out = open(star_steac_out, 'w')
    star_steac_out.write(template)
    star_steac_out.close()
Example #5
def crawl_recursive_threaded(dirpath, ext):
    from database import indexer
    from database import utils
    from multiprocessing import Pool

    # convert to our infos
    cdir = indexer.DirInfo(dirpath, ext)
    cInfos = indexer.dirs_to_info(cdir.subfolders(), ext)

    # comment if you want a silent indexing
    print(cdir.to_string())

    # recursive pooled call
    # NOTE: child calls must not be pooled
    p = Pool(utils.Settings.config['processes'])
    infos = p.map(crawl_recursive, cInfos)
    p.close()

    # remove hierarchy
    dirInfos = [d for sublist in infos for d in sublist]
    dirInfos.append(cdir)

    print('I was crawling with %d processes' %
          utils.Settings.config['processes'])

    return dirInfos
Example #6
  def __decrypt_file(self, private_d, public_n, keys, path_to_file, CRT, k):
    if CRT:
      pool = Pool(processes = k)
      promises = []
    decrypted_data = ''
    with open(path_to_file, 'r') as f:
      encrypted_data = f.read()
      encrypted_data_chunks = list(map(''.join, zip(*[iter(encrypted_data)]*len(str(public_n)))))
      for i in range(len(encrypted_data_chunks)):
        stripped = encrypted_data_chunks[i].lstrip('0')
        if CRT:
          promise = pool.apply_async(self.compute_part_of_message, args=(stripped, keys, i))
          promises.append(promise)
        else:
          decrypted_data += chr(self.__decrypt_message(stripped, private_d, public_n))
    if CRT:
      results = [promise.get() for promise in promises]
      decrypted_sorted = sorted(results, key = lambda x: x[1])
      for data in decrypted_sorted:
        decrypted_data += chr(data[0])

    if CRT:
      pool.close()
    with open(path_to_file + '.dec', 'w') as f:
      f.write(decrypted_data)
    return decrypted_data
Example #7
def start_crawlers(connector_class, num_processes=1):
    """
    Starts a spider process for each spider class in the project

    :param num_processes: the number of simultaneous crawling processes
    :param connector_class: the connector class that should be used by the
    spiders
    """
    spider_classes = pyjobs_crawlers.tools.get_spiders_classes()

    if num_processes == 0:
        connector = connector_class()
        with _get_lock('ALL') as acquired:
            if acquired:
                crawl(spider_classes, connector)
            else:
                print("Crawl process of 'ALL' already running")
            return

    # Splits the spider_classes list in x lists of size num_processes
    spider_classes_chunks = list()
    for x in range(0, len(spider_classes), num_processes):
        spider_classes_chunks.append(spider_classes[x:x + num_processes])

    # Start num_processes number of crawling processes
    for spider_classes_chunk in spider_classes_chunks:
        process_params_chunk = [(spider_class, connector_class)
                                for spider_class in spider_classes_chunk]
        p = Pool(len(process_params_chunk))
        p.map(start_crawl_process, process_params_chunk)
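A small standalone sketch of the same chunk-then-pool pattern used by start_crawlers above, with a trivial worker standing in for start_crawl_process (the worker, items, and chunk size here are illustrative assumptions):

from multiprocessing import Pool

def _demo_crawl_process(params):
    spider_name, connector_name = params
    return '%s via %s' % (spider_name, connector_name)

if __name__ == '__main__':
    items = [('spider%d' % i, 'connector') for i in range(10)]
    num_processes = 4
    chunks = [items[x:x + num_processes] for x in range(0, len(items), num_processes)]
    for chunk in chunks:
        p = Pool(len(chunk))
        print(p.map(_demo_crawl_process, chunk))
        p.close()
        p.join()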
def main(world_folder, replacement_file_name):
    global replacements
    world = nbt.world.WorldFolder(world_folder)
    logger = configure_logging()
    logger.info("Starting processing of %s", world_folder)
    if not isinstance(world, nbt.world.AnvilWorldFolder):
        logger.error("%s is not an Anvil world" % (world_folder))
        return 65 # EX_DATAERR
    if replacement_file_name != None:
        logger.info("Using Replacements file: %s", replacement_file_name)
        with open(replacement_file_name, 'r') as replacement_file:
            replacements = json.load(replacement_file)
    # get list of region files, going to pass this into function to process region
    region_files = world.get_regionfiles();
    
    # Parallel
    q = Queue()
    lp = threading.Thread(target=logger_thread, args=[q])
    lp.start()
    p = Pool(initializer=process_init, initargs=[q,replacements], maxtasksperchild=1)
    region_data = p.map(process_region, region_files)
    # Map has finished up, lets close the logging QUEUE
    q.put(None)
    lp.join()
    
    # Not Parallel
#     region_data = map(process_region, region_files)
    
    # Write output data
    write_block_data(region_data,"output.txt")
    return 0
def k_rbm(infile, outfile):
    #dataset
    data = sio.loadmat(infile)['data']

    # reconstruction cost
    cost_dict = {}
    p = Pool(5)
    first_arg = ["Thread-1", "Thread-2", "Thread-3", "Thread-4", "Thread-5"]
    second_arg = data
    a,b,c,d,e = p.map(rbm_star, itertools.izip(first_arg, itertools.repeat(second_arg)))
    # p.map(rbm_star, itertools.izip(first_arg, itertools.repeat(second_arg)))
    # get the costs from the tuples
    cost_1 = a[0]
    cost_2 = b[1]
    cost_3 = c[2]
    cost_4 = d[3]
    cost_5 = e[4]
    # find the cluster assignments
    for i in xrange(len(cost_1)):
        mincost = min(cost_1[i],cost_2[i],cost_3[i],cost_4[i],cost_5[i])
        if mincost == cost_1[i]:
            cost_dict[i+1] = 1
        elif mincost == cost_2[i]:
            cost_dict[i+1] = 2
        elif mincost == cost_3[i]:
            cost_dict[i+1] = 3
        elif mincost == cost_4[i]:
            cost_dict[i+1] = 4
        else:
            cost_dict[i+1] = 5

    # store results
    json.dump(cost_dict, open(outfile, 'w'))
Example #10
	def downloadImages(self, dirName, urlData):
		child_folder = 'pictures'
		failures = 0
		dirName = os.path.join(dirName,child_folder)
		process_pool = Pool(processes=self._pool_size)
		results = []

		for ud in urlData:
			abs_img = os.path.join(dirName,urlparse(ud).path.strip('/'))
			try:
				os.makedirs(dirname(abs_img))
			except:
				pass
			results.append( process_pool.apply_async( urllib.urlretrieve, [ ud,  abs_img ] ) )

		self.initialize_bar(max=len(results))
		for result in results:
			try:
				result.get(self._timeout)
			except Exception:
				failures += 1
			else:
				self.update_bar()

		self.finish_bar()
		if failures: print("   Completed with errors: Downloaded {0}/{1}".format(len(results) - failures, len(results)))
		self.finish_bar()
Example #11
def rc(rf, alphabet, numOfThreads):
	tryn=0
	counterTmp = 0
	printCounter = 1000
	listBasic = []
	if rf.endswith('.rar'):
		funcChosen = unrar
	elif rf.endswith('.zip') or rf.endswith('.7z') :
		funcChosen = zipFileUnzip
	for a in range(1,len(alphabet)+1):
		for b in itertools.product(alphabet,repeat=a):
			k="".join(b)
			k=re.escape(k)
			listBasic.append(k)
			tryn+=1
			if len(listBasic) == numOfThreads:
				pool = Pool(numOfThreads)
				pool.map_async(funcChosen, listBasic, callback = exitPass)
				pool.close()
				pool.join()
				if resultPass:
					timeWasted = time.time()-start
					print 'Found! Password is '+resultPass
					print "It took " +str(round(time.time()-start,3))+" seconds"
					print "Speed: "+str(round(tryn/float(timeWasted),2))+" passwords/sec"
					print "Tried "+str(tryn)+" passwords"
					exit()
				listBasic = []
			counterTmp+=1
			if counterTmp >= printCounter:
				print 'Trying combination number '+str(tryn)+':'+str(k)
				timeWasted = round(time.time()-start,2)
				if timeWasted > 0:
					print "It took already " +str(timeWasted) +" seconds. Speed: "+str(round(tryn/float(timeWasted),2))+" passwords/sec"
				counterTmp=0
Example #12
def fetch_imagery(image_locations, local_dir):
    pool = Pool(cpu_count())
    tupled = [(loc[0], loc[1], local_dir) for loc in image_locations]
    try:
        pool.map(fetch_imagery_uncurried, tupled)
    finally:
        pool.close()
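An equivalent sketch written with Pool as a context manager (Python 3), which shuts the pool down on exit instead of the explicit try/finally; it assumes the same imports (Pool, cpu_count) and the fetch_imagery_uncurried worker as the example above:

def fetch_imagery_ctx(image_locations, local_dir):
    tupled = [(loc[0], loc[1], local_dir) for loc in image_locations]
    with Pool(cpu_count()) as pool:
        pool.map(fetch_imagery_uncurried, tupled)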
Example #13
        def compress_file(self,corpus, np=4,separator=None):
                """
		construct WLZW pattern out of a corpus, parallelism is an option
		@param corpus - string, file path of the corpus
		@param np - number of processes, if np = 1 the algorithm is run in serial
		@param separator - the separator string to separate doc id and document. pass None if no doc id is given
		@return set, the final set containing all frequent patterns
		"""

                #if only one process, no need for parallelization
                if np==1:
                        return set(_compress_file((corpus,0,np,separator)))

                p=Pool(processes=np)
                l=[]
                for i in range(0,np):
                        l.append((corpus,i,np,separator))
                result=p.imap_unordered(_compress_file,l,1)
                final_set=_union(result)
                p.close()
                p.join()

                return final_set
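A standalone sketch of the same fan-out/union pattern: each worker returns a partial set and the parent merges the results as they arrive from imap_unordered. The stand-in worker and toy corpus below are illustrative, not the real _compress_file:

from multiprocessing import Pool

def _partial_patterns(args):
    text, offset, step = args
    # each worker scans every step-th position and returns a partial set of 2-grams
    return set(text[i:i + 2] for i in range(offset, len(text) - 1, step))

if __name__ == '__main__':
    corpus = 'abracadabra' * 10
    np_ = 4
    jobs = [(corpus, i, np_) for i in range(np_)]
    p = Pool(processes=np_)
    final_set = set()
    for partial in p.imap_unordered(_partial_patterns, jobs, 1):
        final_set |= partial
    p.close()
    p.join()
    print(len(final_set))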
def main():
    """
    ---------------------------------------------------------------------------
    AUTHOR: Kyle Hernandez
    EMAIL: [email protected]

    Calculate the distribution of polymorphic RAD loci across site classes.
    ---------------------------------------------------------------------------

    USAGE: python snp_locations.py gmatrix.tab file.gff out.tab n_threads

    ARGUMENTS:
    	gmatrix.tab - Tab-delimited genotype matrix file of variant sites
        file.gff    - GFF file
        out.tab     - Output file of counts
        n_threads   - The number of threads to run
    """

    # Load the GFF and SNP positions into dictionaries
    load_gff()
    intergenic = process_matrix()
    
    # Map:
    # Create a pool of n_threads workers and use them to process
    # scaffolds separately
    ch_vals = sorted(gff_dict.keys())
    sys.stdout.write("Counting features...\n")
    pool    = Pool(processes=n_threads)
    ct_list = pool.map(process_dicts, ch_vals)

    # Reduce:
    # Process the list of dicts
    print_counts(intergenic, ct_list)
Example #15
def spawn_runpy(cp, wait=60, cb=check_rst):
    "as decorator to run job"
    global WAITQ, RUNQ, CFG
    pool = Pool(processes=CFG['MAXJOBS'])
    while len(WAITQ) > 0 or len(RUNQ) > 0:
        if len(RUNQ) <= CFG['MAXJOBS'] and len(WAITQ) > 0:
            path, test = WAITQ.pop()
            rst = pool.apply_async(call_runpy, (cp, path, test,))
            RUNQ.append((rst, test, timeit.default_timer()))
        else:
            for r in RUNQ:
                usec = float("%.2f" %(timeit.default_timer()-r[2]))
                if r[0].ready() and r[0].successful():
                    print "[{0}] success used {1} usec".format(r[1], usec)
                    RUNQ.remove(r)
                    if cb:
                        cb(r[1], 'pass', usec)
                else:
                    if usec > CFG['TIMEOUT']:
                        print "[{0}] unsuccess used timeout {1} usec".format(r[1], usec)
                        RUNQ.remove(r)  # AsyncResult has no terminate(); drop the timed-out job from the queue
                        if cb:
                            cb(r[1], 'fail', usec)

        time.sleep(float(wait))
Example #16
def mass_tri_plot(data, savedir, name='plot', Type='speed', Map=False):
    """
    Plots all time series.  Makes use of multiprocessing for speed.
    """
    trigrid = data['trigrid']
    #get the data to plot
    try:
        toPlot = data[Type]
    except KeyError:
        print Type + " is not an element of data.  Please calculate it."
        raise Exception("Invalid dictionary entry")
    #set the variable as a global variable
    global plotvar 
    plotvar = toPlot
    global saveDir
    saveDir = savedir
    global grid
    grid = trigrid
    #see if the save directory exists, or make it
    if not os.path.exists(savedir):
        os.makedirs(savedir)
    l = toPlot.shape[0]
    
    p = Pool(4)
    plt.gca().set_aspect('equal')
    p.map(save_plot, range(50))
    clearall()
Example #17
  def score_all_genes(self, graph, num_procs=1):
    partial_score_gene = partial(score_gene, graph=graph, top_genes=self.top_genes)
    p = Pool(num_procs)
    result = p.map(partial_score_gene, list(self.vd.gene_names()))
    p.close()

    # convert them all to percentiles
    cent_hist = numpy.array([x[1] for x in result if x[1] != -1])
    nn_hist = numpy.array([x[2] for x in result if x[2] != -1])

    batch = []

    for gene, cent_score, nn_score in result:
      # edge case: gene is a top gene
      if gene in self.top_genes:
        cent_perc = 1
        nn_perc = 1
      # edge case: gene isn't in network
      elif cent_score == -1 or \
           nn_score == -1:
        cent_perc = 0
        nn_perc = 0
      else:
        cent_perc = scipy.stats.percentileofscore(cent_hist, cent_score) / 100.0
        nn_perc = 1 - scipy.stats.percentileofscore(nn_hist, nn_score) / 100.0

        print "gene:  %s\n  c:   %s\n  c_p: %s\n  n:   %s\n  n_p: %s" % \
          (gene, cent_score, cent_perc, nn_score, nn_perc)

      batch.append((cent_score, cent_perc, nn_score, nn_perc, gene))

    self.vd._c.executemany("UPDATE genes SET cent_score = ?, cent_perc = ?, " \
      "nn_score = ?, nn_perc = ? WHERE name = ?", batch)
    self.vd._conn.commit()
def updateTranslation(args):
    # Get map that contains (besides other stuff)
    #  the crowdin ID for a given file
    translationFilemap = getTranslationFilemapCache(args.language, args.force_filemap_update)

    # Collect valid downloadable files for parallel processing
    fileinfos = []
    for filename, fileinfo in translationFilemap.items():
        filepath = os.path.join("cache", args.language, fileinfo["path"])
        # Create dir if not exists
        try: os.makedirs(os.path.dirname(filepath))
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise
        fileid = fileinfo["id"]
        fileinfos.append((fileid, filepath))
    # Curry the function with the language
    performDownload = functools.partial(performPOTDownload, args.language)
    # Perform parallel download
    if args.num_processes > 1:
        pool = Pool(args.num_processes)
        pool.map(performDownload, fileinfos)
        pool.close()
        pool.join()
    else:
        for t in fileinfos:
            performDownload(t)
    #Set download timestamp
    timestamp = datetime.datetime.now().strftime("%y-%m-%d %H:%M:%S")
    with open("lastdownload.txt", "w") as outfile:  
        outfile.write(timestamp)
Example #19
def multi_mode(start, stop):
    print "going multi"
    from multiprocessing import Pool

    pool = Pool(processes=4)
    result = pool.map(factorize, xrange(start, stop + 1), chunksize=100)
    print uniq_counter(result)
Example #20
def compute_tdbf():
    conn = db_conn('bnc')
    cur = conn.cursor()
    # select keys and parsed from table
    sql = 'SELECT xmlID, divIndex, globalID, parsed FROM entropy_DEM100'
    cur.execute(sql)
    data = cur.fetchall()
    # initialize
    pool = Pool(multiprocessing.cpu_count())
    manager = Manager()
    queue = manager.Queue()
    # mp
    args = [(d, queue) for d in data]
    result = pool.map_async(compute_tdbf_worker, args, chunksize=5000)
    # manager loop
    while True:
        if result.ready():
            print('\n all rows processed')
            break
        else:
            sys.stdout.write('\r{}/{} processed'.format(queue.qsize(), len(args)))
            sys.stdout.flush()
            time.sleep(1)
    # update
    processed_results = result.get()
    for i, res in enumerate(processed_results):
        xml_id, div_idx, g_id, sub_tree, td, bf = res
        sql = 'UPDATE entropy_DEM100 SET parsedSimple = %s, td = %s, bf = %s WHERE xmlID = %s AND divIndex = %s AND globalID = %s'
        cur.execute(sql, (sub_tree, td, bf, xml_id, div_idx, g_id))
        if i % 999 == 0 and i > 0:
            sys.stdout.write('\r{}/{} updated'.format(i+1, len(processed_results)))
            sys.stdout.flush()
    conn.commit()
Example #21
def get_needle_tips(images):
    """Get sample tips from images."""
    tips = []
    results = []

    # Do not make more processes than needed for the number of images.
    if len(images) > multiprocessing.cpu_count():
        proc_count = multiprocessing.cpu_count()
    else:
        proc_count = len(images)

    pool = Pool(processes=proc_count)

    for image in images:
        results.append(pool.apply_async(_get_ellipse_point,
                                        args=(image,)))

    for result in results:
        tip = result.get()
        if tip is not None:
            tips.append(tip)

    if len(tips) == 0:
        raise ValueError("No sample tip points found.")

    return tips
Example #22
def main():

    parser = ArgumentParser(description="Speed up your SHA. A different hash style.")
    parser.add_argument('-1', '--sha1', action='store_true')
    parser.add_argument('-2', '--sha224', action='store_true')
    parser.add_argument('-3', '--sha256', action='store_true')
    parser.add_argument('-4', '--sha384', action='store_true')
    parser.add_argument('-5', '--sha512', action='store_true')
    parser.add_argument('-f', '--file', type=str, help="The path to the file")

    if len(sys.argv) == 1:
        parser.print_help()
        return

    global args
    args = parser.parse_args()

    hashtree = ''

    big_file = open(args.file, 'rb')
    pool = Pool(multiprocessing.cpu_count())

    for chunk_hash in pool.imap(hashing, chunks(big_file)):
        hashtree += chunk_hash + ":hash"

    pool.close()
    pool.join()
    big_file.close()

    print(str(hashing(hashtree.encode('ascii'))))
Example #23
    def get(self):
        mode = toAlpha3Code(self.get_argument('lang'))
        text = self.get_argument('q')
        if not text:
            self.send_error(400, explanation='Missing q argument')
            return

        def handleCoverage(coverage):
            if coverage is None:
                self.send_error(408, explanation='Request timed out')
            else:
                self.sendResponse([coverage])

        if mode in self.analyzers:
            pool = Pool(processes=1)
            result = pool.apply_async(getCoverage, [text, self.analyzers[mode][0], self.analyzers[mode][1]])
            pool.close()

            @run_async_thread
            def worker(callback):
                try:
                    callback(result.get(timeout=self.timeout))
                except TimeoutError:
                    pool.terminate()
                    callback(None)

            coverage = yield tornado.gen.Task(worker)
            handleCoverage(coverage)
        else:
            self.send_error(400, explanation='That mode is not installed')
Example #24
class JobPool(object):

    """
    Pool container.
    """
    pool = None
    message_queue = None

    def __init__(self, max_instances=4):
        self.message_queue = Queue()
        self.pool = Pool(max_instances, execute_task, (self.message_queue,))
        atexit.register(self.clear)

    def add_analysis(self, analysis):
        """
        Add analysis to the pool.
        """
        analysis.set_started()
        self.message_queue.put(analysis)

    def clear(self):
        """
        Pool cleanup.
        """
        self.pool.terminate()
        self.pool.join()
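A standalone sketch of the same queue-fed pool pattern used by JobPool: the workers are started with a blocking consumer function passed as the Pool initializer, as in __init__ above, but with an illustrative worker and a sentinel-based shutdown instead of terminate():

from multiprocessing import Pool, Queue

def _demo_execute_task(message_queue):
    while True:
        item = message_queue.get()
        if item is None:          # sentinel: stop consuming
            break
        print('processing', item)

if __name__ == '__main__':
    q = Queue()
    pool = Pool(2, _demo_execute_task, (q,))
    for i in range(5):
        q.put(i)
    for _ in range(2):            # one sentinel per worker
        q.put(None)
    pool.close()
    pool.join()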
Example #25
class YaraJobPool(object):

    """
    Yara pool container.
    """
    pool = None
    message_queue = None

    def __init__(self, max_instances=3):
        self.message_queue = Queue()
        self.pool = Pool(max_instances, execute_yara_task,
                         (self.message_queue,))
        atexit.register(self.clear)

    def add_yara_task(self, yara_task):
        """
        Adds the yara task.
        """
        self.message_queue.put(yara_task)

    def clear(self):
        """
        Pool cleanup.
        """
        self.pool.terminate()
        self.pool.join()
Example #26
def get_fractional_errors(R_star, L_star, P_c, T_c):
	"""
		Pass in "guess" conditions.
		Will then calculate inward and outward errors,

		Returns:
			[Data array]
			dY - over/undershoots (+/-, going outward)
				[dx handled outside this]
	"""

	# R_star, L_star, P_c, T_c = x

	P_c_0		= modelparameters.P_c # core pressure, [dyne cm^-2]
	T_c_0 		= modelparameters.T_c # core temperature, [K]
	R_star_0 	= modelparameters.R_star
	L_star_0 	= modelparameters.L_star

	print ""
	print "R: " + str(R_star / R_star_0)
	print "L: " + str(L_star / L_star_0)
	print "P: " + str(P_c / P_c_0)
	print "T: " + str(T_c / T_c_0)


	X 		= modelparameters.X
	Y 		= modelparameters.Y
	Z 		= modelparameters.Z
	mu 		= modelparameters.mu
	params 	= (X, Y, Z, mu)

	M_star 	= modelparameters.M_star
	m_fitting_point	= modelparameters.m_fitting_point

	pool = Pool(2)
	outward_results = pool.apply_async(integrate.integrate_outwards, 
		[M_star, m_fitting_point, P_c, T_c, mu, X, Y, Z] )

	inward_results  = pool.apply_async(integrate.integrate_inwards, 
		[M_star, m_fitting_point, R_star, L_star, mu, X, Y, Z] )

	m_outward, y_outward, infodict_outward 	= outward_results.get()

	m_inward, y_inward, infodict_inward 	= inward_results.get()

	dr = y_inward[-1,0] - y_outward[-1,0]
	dl = y_inward[-1,1] - y_outward[-1,1]
	dP = y_inward[-1,2] - y_outward[-1,2]
	dT = y_inward[-1,3] - y_outward[-1,3]

	dY = np.array([dr, dl, dP, dT])

	print ''
	print 'fractional errors:'
	print "dR: " + str(dr / y_inward[-1,0])
	print "dL: " + str(dl / y_inward[-1,1])
	print "dP: " + str(dP / y_inward[-1,2])
	print "dT: " + str(dT / y_inward[-1,3])

	return dY
    def get_location(self):
        """

        Extracts the location of each pixel in the satellite image

        """
        self.ncols = self.satellite_gdal.RasterXSize / 2
        self.nrows = self.satellite_gdal.RasterYSize / 2
        self.length_df = self.nrows * self.ncols
        print 'Columns, rows', self.ncols, self.nrows
        cols_grid, rows_grid = np.meshgrid(
                    range(0, self.ncols), 
                    range(0, self.nrows))
        self.cols_grid = cols_grid.flatten()
        self.rows_grid = rows_grid.flatten()
        print 'Checking the meshgrid procedure works'
        # getting a series of lat lon points for each pixel
        self.geotransform = self.satellite_gdal.GetGeoTransform()
        print 'Getting locations'
        self.location_series = np.array(parmap.starmap(
                        pixel_to_coordinates, 
                        zip(self.cols_grid, self.rows_grid), 
                        self.geotransform,
                        processes = self.processes))
        print 'Converting to Points'
        pool = Pool(self.processes)
        self.location_series = pool.map(
                        point_wrapper, 
                        self.location_series)
def main(path, out, cores):
    """
    Compute contact energies for each pdb in path and write results to 'out'.
    :param path: str
    :param out: str
    :param cores: int
    :return: None
    """
    # Find all pdbs in path
    workload = []
    for file in os.listdir(path):
        if os.path.splitext(file)[1].lower() == ".pdb":
            workload.append(file)
    # Print few newlines to prevent progressbar from messing up the shell
    print("\n\n")
    # Compute energies
    pool = Pool(processes=cores)
    results = []
    for (nr, pdb) in enumerate(workload):
        updateprogress(pdb, nr / len(workload))
        e = computecontactenergy(os.path.join(path, pdb), pool)
        results.append((pdb, e))
    pool.close()
    # Make 100% to appear
    updateprogress("Finished", 1)
    # Store output
    with open(out, "w") as handler:
        handler.write("PDB,Energy in kcal/mol\n")
        for pair in results:
            handler.write("{},{}\n".format(*pair))
Example #29
def process_articles(entity_type=Entity, output_filename='output-all.txt',
                     corpus_root='corpus/'):
    terms = select_terms(entity_type)
    
    Session.expunge_all()
    Session.close()
    
    articles = Session.query(Entity.sep_dir).filter(Entity.sep_dir!=None)
    articles = articles.filter(Entity.sep_dir!='')
    articles = articles.distinct().all()
    articles = [a[0] for a in articles]
   
    # parallel processing of articles
    p = Pool()
    args = [(title, terms, entity_type, None, corpus_root) for title in articles]
    doc_lines = p.map(process_wrapper, args)
    p.close()

    #serial processing for tests
    '''
    doc_lines = []
    for title in articles:
        lines = process_article(title, terms, entity_type, None, corpus_root)
        doc_lines.append(lines)
    '''

    # write graph output to file
    print output_filename
    with open(output_filename, 'w') as f:
        for lines in doc_lines:
            f.writelines(lines)
def matrix_vector_iteration_by_processes(A,x,k):
	# create a temporary directory to store the matrix and the vectors
	tmpdir = tempfile.mkdtemp()

	nvec = get_nvec(x)
	y = x.copy()

	save_matrix(tmpdir,A)
	for i in xrange(nvec):
		save_x(tmpdir,x,i)

	# start processes
	pool = Pool(processes=min(nvec,6))
	processes = []

	for i in xrange(nvec):
		processes.append( pool.apply_async(matrix_vector_iteration_process, (tmpdir,i,k)) ) 

	# fetch results (vector/matrix shape version)
	if x.ndim  == 1:
		processes[0].get()
		y = load_x(tmpdir,0)
	else:
		for i in xrange(nvec):
			processes[i].get()
			y[:,i] = load_x(tmpdir,i)

	pool.close()

	# remove temporary directory (with all it contains)
	shutil.rmtree(tmpdir)

	return y
Example #31
def learn_failures((X, Y, L)):
    def func(failures):
        ret = -maximum_likelihood_helpers.func(X,Y,failures)
        return ret

    def grad(failures):
        ret = -maximum_likelihood_helpers.grad(X,Y,failures)
        return ret

    X0 = np.log(np.ones(L+1, dtype=float)*(1-1e-6)) #initialize a small distance away from the bound
    bounds = [(None, 0) for _ in xrange(L+1)] 
    bounds[-1] = (None, np.log(1-1e-6)) #never allow the leak to go too close to 0
    failures, _, _ = opt.fmin_l_bfgs_b(func, X0, grad, bounds=bounds, disp=0)
    failures = np.exp(failures) #optimization was in log space. Exponentiate to get values.
    return failures

if __name__ == '__main__':

    pool = Pool(20)
    L = 50 # latent variables
    M = 300 # observations
    N = 1000 # patients
    X_matrix = np.array(np.random.rand(N, M) > 0.7, dtype=int)
    Y_matrix = np.array(np.random.rand(N, L) > 0.8, dtype=int)
    Y = [np.nonzero(y)[0].tolist() for y in Y_matrix]
    targets = ((X_matrix[:, j].tolist(), Y, L) for j in xrange(M))
    failures = np.array(pool.map(learn_failures, targets))
    print failures.shape

def parse(country, cc):
    print(f'Start: {country}')
    path = f'data/{country}'

    emissions = parse_xs(f'{path}/{os.listdir(path)[-1]}')
    energy = parse_energy(f'{cc}')

    print(f'Done: {country}')
    return country, [[u'CO₂e'] + emissions]             \
                  + list(zip(energy_params, *energy))   \
                  + [['years'] + list(range(1990, 2016))]



if __name__ == '__main__':
    if sys.version_info < (3, 6):
        sys.exit('Python 3.6 or later is required.\n')

    with open('countries.json') as f:
        countries = json.load(f)


    with Pool(processes=20) as pool:
        res = pool.starmap(parse, [(c, countries[c]) for c in awesome + f****d])


    data = { c: v for c, v in res }
    data['Kazakhstan'] = data.pop('Kazakhstan2')

    with io.open(f'renewable_to_emissions.min.json', 'w', encoding='utf8') as f:
        json.dump(data, f, ensure_ascii=False)
Example #33
class Validation:

    def __init__(self, in_file, fp_db_name, output, categories, top, blacklist, malware_ctx_file, proc_list):
        if output == sys.stdout:
            self.out_file_pointer = sys.stdout
        else:
            self.out_file_pointer = open(output, 'w')
        self.categories = categories
        self.top = top
        self.blacklist = blacklist
        self.malware_ctx = None
        if malware_ctx_file != None:
            self.malware_ctx = {}
            for line in gzip.open(malware_ctx_file):
                fp_ = json.loads(line)
                self.malware_ctx[fp_['str_repr']] = fp_

        self.proc_list = None
        if proc_list != None:
            self.proc_list = []
            t_ = proc_list.split(';')
            for s in t_:
                if s != '':
                    tmp_proc_list = s.split(',')
                    self.categories.append(tmp_proc_list[0])
                    self.proc_list.append(tmp_proc_list)

        # read in application categories
        app_cat_file = 'application_categories.json.gz'
        with gzip.open(app_cat_file,'r') as fp:
            self.app_cat_data = json.loads(fp.read())

        self.mt_pool = Pool(32)

        self.input_file = in_file
        if in_file.endswith('.csv.gz'):
            self.data = self.read_file_csv(in_file)
        elif in_file.endswith('.json.gz') and 'dmz' in in_file:
            self.data = self.read_file_dmz_json(in_file)
        elif in_file.endswith('.json.gz'):
            self.data = self.read_file_json(in_file)
        else:
            print('error: file format not supported')
            sys.exit(-1)


    def validate_process_identification(self):
        results = []
        unknown_fp = 0
        unknown_s  = 0

        if self.top:
            results = self.mt_pool.map(get_results_top, [self.data[k] for k in self.data])
        elif self.blacklist:
            results = self.mt_pool.map(get_results_blacklist, [self.data[k] for k in self.data])
        else:
            results = self.mt_pool.map(get_results, [self.data[k] for k in self.data])
#            for k in self.data:
#                results.append(get_results(self.data[k]))

        self.data = None

        self.analyze_results(results)


    def analyze_results(self, results):
        r_tmp_ = self.mt_pool.map(process_result, [(sl, self.categories) for sl in results])
        r_tmp_ = [x for sl in r_tmp_ for x in sl]
        r_ = [sum([row[i] for row in r_tmp_]) for i in range(0,len(r_tmp_[0][:-1]))]

        print('FILE: %s' % self.input_file)
        print('\tTotal:\t\t\t\t    % 8i' % r_[0])
        print('\t                              :\t      top-1    top-2    top-3    top-4    top-5')
        print('\tProcess Name Category Accuracy:\t    %0.6f %0.6f %0.6f %0.6f %0.6f' % (r_[2]/r_[0], (r_[2]+r_[5])/r_[0], (r_[2]+r_[5]+r_[7])/r_[0], (r_[2]+r_[5]+r_[7]+r_[9])/r_[0], (r_[2]+r_[5]+r_[7]+r_[9]+r_[11])/r_[0]))
        print('\tProcess Name Accuracy:\t\t    %0.6f %0.6f %0.6f %0.6f %0.6f' % (r_[1]/r_[0], (r_[1]+r_[4])/r_[0], (r_[1]+r_[4]+r_[6])/r_[0], (r_[1]+r_[4]+r_[6]+r_[8])/r_[0], (r_[1]+r_[4]+r_[6]+r_[8]+r_[9])/r_[0]))
#        print('\tSHA-256 Accuracy:\t\t    %0.6f' % (r_[3]/r_[0]))

        r_c = [row[-1] for row in r_tmp_]
        idx = 0
        for c in self.categories:
            if c == '':
                continue
            r_ = [sum([row[idx][i] for row in r_c]) for i in range(0,len(r_c[0][0]))]
            print('\n\t%s Accuracy:\t\t    %0.6f' % (c, (r_[1]/r_[0])))
            print('\t%s Confusion Matrix:' % c)
            print('\t\t\t   Positive       Negative')
            print('\t\tPositive:% 9i\t% 9i' % (r_[2], r_[5]))
            print('\t\tNegative:% 9i\t% 9i' % (r_[4], r_[3]))
            if r_[2]+r_[5] > 0:
                print('\t\tRecall:    %0.6f' % (r_[2]/(r_[2]+r_[5])))
            else:
                print('\t\tRecall:    %0.6f' % (0.0))
            if r_[2]+r_[4] > 0:
                print('\t\tPrecision: %0.6f' % (r_[2]/(r_[2]+r_[4])))
            else:
                print('\t\tPrecision: %0.6f' % (0.0))

            idx += 1


    def read_file_csv(self, f):
        data = {}

        max_lines = 30000000
        cur_line  = 0

        start = time.time()
        for line in os.popen('zcat %s' % (f)):
            cur_line += 1
            if cur_line > max_lines:
                break
#            if '(0000)' not in line:
#                continue
            t_          = line.strip().split(',')
            src         = t_[0]
            proc        = t_[3]
            sha256      = t_[4]
            type_       = t_[5]
            fp_str      = t_[6].replace('()','')
            dst_x       = t_[7].split(')')
            os_         = clean_os_str(t_[8])
            if os_ == None:
                continue

            dst_ip      = dst_x[0][1:]
            dst_port    = int(dst_x[1][1:])
            server_name = dst_x[2][1:]
            src_port    = int(t_[9].split(')')[1][1:])
            av_hits     = 0
            if len(t_) > 10:
                av_hits = int(t_[10])

            proc = clean_proc_name(proc)

            if proc in uninformative_proc_names:
                continue

            fp_malware_ = False
            if self.malware_ctx != None:
                if fp_str in self.malware_ctx:
                    fp_malware_ = is_fp_malware(self.malware_ctx[fp_str])
                else:
                    continue

            app_cat = None
            if proc in self.app_cat_data:
                app_cat = self.app_cat_data[proc]
            malware = is_proc_malware({'process':proc}, fp_malware_, av_hits)
            domain = server_name
            sni_split = server_name.split('.')
            if len(sni_split) > 1:
                domain = sni_split[-2] + '.' + sni_split[-1]
            if server_name in sni_whitelist or domain in domain_whitelist:
                malware = False
            app_cats = {}
            app_cats['malware'] = malware
            for c in self.categories:
                if c == 'malware':
                    app_cats[c] = malware
                else:
                    app_cats[c] = False
                    if c == app_cat:
                        app_cats[c] = True

            if os_ == None:
                continue

            if src not in data:
                data[src] = []

            data[src].append((src,src_port,proc,sha256,type_,fp_str,dst_ip,dst_port,server_name,1,os_,app_cats, self.proc_list))

        print('time to read data:\t%0.2f' % (time.time()-start))

        return data


    def read_file_json(self, f):
        data = {}

        start = time.time()
        key_ = 0
        data[key_] = []
        for line in os.popen('zcat %s' % (f)):
            fp_ = json.loads(line)
            if 'str_repr' in fp_:
                fp_str = fp_['str_repr']
            else:
                fp_str = fp_['md5']

            if 'process_info' in fp_:
                new_procs = []
                fp_malware_ = is_fp_malware(fp_)
                for p_ in fp_['process_info']:
                    if 'process' not in p_:
                        p_['process'] = p_['filename']
                    p_['process'] = clean_proc_name(p_['process'])
                    if is_proc_malware(p_, fp_malware_):
                        new_procs.extend(clean_malware_proc(p_))
                    else:
                        new_procs.append(p_)
                fp_['process_info'] = new_procs


                for p_ in fp_['process_info']:
                    proc = p_['process']
                    sha256 = p_['sha256']

                    if p_['process'] in uninformative_proc_names:
                        continue



                    # uncomment to classify non-top processes
#                    pn = proc
#                    pn = app_families[pn] if pn in app_families else pn
#                    if pn in ['Chromium','Firefox','Safari','Internet Explorer','Adobe Tools',
#                              'Microsoft Office','Cisco Webex','Cisco AMP','iCloud','Box']:
#                        continue




                    app_cat = None
                    if proc in self.app_cat_data:
                        app_cat = self.app_cat_data[proc]
                    malware = is_proc_malware(p_, False)
                    app_cats = {}
                    app_cats['malware'] = malware
                    for c in self.categories:
                        if c == 'malware':
                            app_cats[c] = malware
                        else:
                            app_cats[c] = False
                            if c == app_cat:
                                app_cats[c] = True

                    for x_ in p_['dst_info']:
                        dst_x       = x_['dst'].split(')')
                        dst_ip      = dst_x[0][1:]
                        dst_port    = int(dst_x[1][1:])
                        server_name = dst_x[2][1:]
                        data[key_].append((None,None,proc,sha256,'tls',fp_str,dst_ip,dst_port,
                                           server_name,x_['count'],None,app_cats,self.proc_list))

                        if len(data[key_]) > 5000:
                            key_ += 1
                            data[key_] = []

        print('time to read data:\t%0.2f' % (time.time()-start))

        return data


    def read_file_dmz_json(self, f):
        data = {}

        key_ = 0
        data[key_] = []
        start = time.time()
        for line in os.popen('zcat %s' % (f)):
            fp_ = json.loads(line)
            if 'str_repr' in fp_:
                fp_str = fp_['str_repr']
            else:
                fp_str = fp_['md5']
            if fp_str in schannel_fps:
                fp_str = 'schannel'

            proc = 'dmz_process'
            sha256 = 'dmz_process'
            app_cats = {}
            app_cats['malware'] = False

#            if fp_str not in data:
#                data[fp_str] = []

            dst_info_key = 'dmz_dst_info'
            if dst_info_key not in fp_:
                dst_info_key = 'dst_info'

            for x_ in fp_[dst_info_key]:
                dst_x       = x_['dst'].split(')')
                dst_ip      = dst_x[0][1:]
                dst_port    = int(dst_x[1][1:])
                server_name = dst_x[2][1:]
#                data[fp_str].append((None,None,proc,sha256,'tls',fp_str,dst_ip,dst_port,
#                                     server_name,x_['count'],None,app_cats))
                data[key_].append((None,None,proc,sha256,'tls',fp_str,dst_ip,dst_port,
                                   server_name,x_['count'],None,app_cats,self.proc_list))

                if len(data[key_]) > 5000:
                    key_ += 1
                    data[key_] = []

        print('time to read data:\t%0.2f' % (time.time()-start))

        return data
    items = re.findall(pattern, html)

    itemdict = {}
    for item in items:
        itemdict['index'] = item[0]
        itemdict['name'] = item[1].strip()[3:]
        itemdict['score'] = item[2] + '.' + item[3]
        yield itemdict


def write_to_file(content):
    with codecs.open('maoyan.txt', 'a', 'utf-8') as f:
        f.write(json.dumps(content,ensure_ascii=False) + '\n')
        f.close()

def main(offset):

    url = 'http://maoyan.com/board/4?offset={}'.format(offset)
    html = get_one_page(url)
    for dictitem in parse_one_page(html):
        print dictitem
        write_to_file(dictitem)

if __name__ == '__main__':
    # for i in range(0,100,10):
    #     main(i)
    #
    pool = Pool()
    pool.map(main, range(0,100,10))

    # map(main, range(0, 100, 10))
Example #35
    webpage = urlopen(req).read()
    html = bs(webpage, 'html.parser')
    return html

# get all club links under a url
def get_club_links(url):
    html = get_html(url)
    club_links = {elem.text: 'https://footballdatabase.com' + elem['href'] for elem in html.find_all('a', {'class': 'sm_logo-name clubbrowser-club'})}
    return club_links

# urls to club lists for all clubs starting with each letter
club_letter_urls = ['https://footballdatabase.com/clubs-list-letter/' + letter for letter in ascii_uppercase]

if __name__ == '__main__':

    pool = Pool(6)
    # get all club links for all club letter urls
    club_link_dicts = pool.imap(get_club_links, club_letter_urls)
    all_club_links = {}
    for club_link_dict in club_link_dicts:
        all_club_links.update(club_link_dict)
    for club in all_club_links:
        # add the lowercase name to the dictionary
        all_club_links[club.lower()] = all_club_links[club]
        # look for FC at the end or start of each club and add the name without FC
        if re.search('^FC\s+|\s+FC$', club):
            club_no_FC = re.sub('^FC\s+|\s+FC$', '', club)
            all_club_links[club_no_FC] = all_club_links[club]
            all_club_links[club_no_FC.lower()] = all_club_links[club]
    pool.close()
    pool.join()
                cslsdistance = []
                for layer in [0, 1, 2]:
                    rightside = batchvectors[tree_index - 1][layer][keys[3]]
                    distance.append(csd(leftside[layer], rightside))
                    rneigh_ind = knn_from_tree(neigh_tree[layer], tree_index,
                                               args.k)
                    rneigh = [
                        batchvectors[i][layer][keys[3]] for i in rneigh_ind
                        if i < len(batchvectors)
                    ]
                    cslsdistance.append(
                        csls(leftside[layer], rightside, lneigh[layer],
                             rneigh))
                return (vocab[tree_index - 1], distance, cslsdistance)

            with Pool(8) as p:
                distances = p.map(calc_distance, candidates)
            for element in distances:
                scores[element[0]] = element[1]
                cslsscores[element[0]] = element[2]
            is_better = np.array([0, 0, 0])
            csls_better = np.array([0, 0, 0])
            if correct in scores:
                for pick in scores:
                    is_better += [
                        scores[pick][i] <= scores[correct][i] for i in range(3)
                    ]
                    csls_better += [
                        cslsscores[pick][i] <= cslsscores[correct][i]
                        for i in range(3)
                    ]
    print(
        f'Loaded expression matrix of {ex_matrix.shape[0]} cells and {ex_matrix.shape[1]} genes in {end_time - start_time} seconds...',
        file=sys.stdout)

    tf_names = load_tf_names(args.tfs_fname.name)
    print(f'Loaded {len(tf_names)} TFs...', file=sys.stdout)

    ex_matrix, gene_names, tf_names = _prepare_input(ex_matrix, gene_names,
                                                     tf_names)
    tf_matrix, tf_matrix_gene_names = to_tf_matrix(ex_matrix, gene_names,
                                                   tf_names)

    print(f'starting {args.method} using {args.num_workers} processes...',
          file=sys.stdout)
    start_time = time.time()

    with Pool(args.num_workers) as p:
        adjs = list(
            tqdm.tqdm(p.imap(run_infer_partial_network,
                             target_gene_indices(gene_names,
                                                 target_genes='all'),
                             chunksize=1),
                      total=len(gene_names)))

    adj = pd.concat(adjs).sort_values(by='importance', ascending=False)

    end_time = time.time()
    print(f'Done in {end_time - start_time} seconds.', file=sys.stdout)

    adj.to_csv(args.output, index=False, sep="\t")
        venue_id = sys.argv[2]
        tips = save_tips(venue_id)
        print("Got %s tips for venue %s" % (len(tips), venue_id))

    elif argument == 'listen':
        # Set up the logging
        logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p',
                            level=logging.INFO)

        # Set up the RabbitMQ channel
        # Tutorial: https://www.rabbitmq.com/tutorials/tutorial-six-python.html
        connection = pika.BlockingConnection(pika.ConnectionParameters(host=RABBITMQ_HOST))
        channel = connection.channel()
        channel.queue_declare(queue=RABBITMQ_QUEUE)

        channel.basic_qos(prefetch_count=1)
        channel.basic_consume(on_request,
                              queue=RABBITMQ_QUEUE)

        ts = int(time.time())
        pool = Pool(10)
        logging.info(' [x] Waiting for RPC requests...')

        channel.start_consuming()

    else:
        print("unknown argument - %s" % argument)
        sys.exit(0)


Example #39
x = Variable()
u = Parameter()  # scalar parameter used in the constraint; assigned in get_x below

objective = Minimize(quad_form(x, 1) + 1)
constraint = [quad_form(x, 1) - 6 * x + 8 <= u]
p = Problem(objective, constraint)


# Assign a value to gamma and find the optimal x.
def get_x(u_value):
    u.value = u_value
    result = p.solve()
    return x.value


u_values = np.linspace(-0.9, 10, num=50)
# Serial computation.
x_values = [get_x(value) for value in u_values]

# Parallel computation.
pool = Pool(processes=4)
x_values = pool.map(get_x, u_values)

# Plot the tradeoff curve
plot(u_values, x_values)
# label
title('Sensitivity Analysis: p*(u) vs u')
xlabel('u')
ylabel('p*(u)')
axis([-2, 10, -1, 3])
show()
Example #40
from multiprocessing import Pool

def f(x):
    return x*x

if __name__ == '__main__':
    pool = Pool(processes=40)             
    result = pool.apply_async(f, [10])    
    print result.get(timeout=1)           
    print pool.map(f, range(10))         
        if '-c' in sys.argv:
            categories = GetCategories(sys.argv[sys.argv.index('-c')+1])    
        if '-topk' in sys.argv:
            topk = int(sys.argv[sys.argv.index('-topk')+1])
        if '-d' in sys.argv:
            abs_distance = float(sys.argv[sys.argv.index('-d')+1])
        if '-i' in sys.argv:
            unIoU = 1- float(sys.argv[sys.argv.index('-i')+1])

        real_file = GetFileList(sys.argv[1], [])
        predicted_file = GetFileList(sys.argv[2], [])
        real_txt_list = []
        predicted_txt_list = []
        for pth in real_file:
            real_txt_list += GetFileList(pth + '/txt',['.txt'])
        for pth in predicted_file:     
            predicted_txt_list += GetFileList(pth,['.txt'])

    pool = Pool(int(cpu_count()*7/8))
    results = pool.map(Experiment, zip(real_txt_list, predicted_txt_list))
    precision, recall, F_score, error_rate, missing_rate, over_detected_rate = PerformanceToCsv(results) 

    print("************************************************************************************************************")
    print("precision:{:.3f}; recall:{:.3f}; F_score:{:.3f}; error_rate:{:.3f}; missing_rate:{:.3f}; over_detected_rate:{:.3f}"\
          .format(precision, recall, F_score, error_rate, missing_rate, over_detected_rate))
    print("************************************************************************************************************") 
   
    toc = time.clock()
    print('running time: {:.3f} seconds'.format(toc-tic))
    
#pool = multiprocessing.Pool(4)
#out1, out2, out3 = zip(*pool.map(calc_stuff, range(0, 10 * offset, offset)))

from multiprocessing import Pool

def f(x):
    return x*x

if __name__ == '__main__':
    with Pool(5) as p:
        print(p.map(f, [1, 2, 3]))
from multiprocessing import Pool

def f(x):
    return x*x


# Pool is used for parallelism when multiple inputs are provided to the function,
# processing all of the calls at the same time
if __name__ == '__main__':
    p = Pool(5)
    print(p.map(f, [1, 2, 3]))
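If the worker takes more than one argument, starmap (available on Pool since Python 3.3) unpacks each tuple of arguments; a small sketch, separate from the examples above:

from multiprocessing import Pool

def add(x, y):
    return x + y

if __name__ == '__main__':
    with Pool(5) as p:
        print(p.starmap(add, [(1, 2), (3, 4), (5, 6)]))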
Example #44
        try:
            resp = re.search(r'resp=(.*),cost', lines[0])
        except IndexError as e :
            pass
        else:
            response = resp.group(1)
            if resp:
                ret = urllib.parse.unquote(response)
                return_rule = get_return_rule(json.loads(ret)['response'])
                result_queue.put_nowait(return_rule)
            else:
                print('no response')


if __name__ == '__main__':
    pool = Pool(4)
    qid_queue = Manager().Queue()
    # result_queue = Manager().Queue()
    pool.apply_async(find_log, args=('query', 0, qid_queue,), callback=call_back)
    print('one worker has gone off to find qids')
    while isinstance(qid_queue.get(), int):
        pool.apply_async(find_log, args=('responseServer', qid_queue.get_nowait(), qid_queue,), callback=call_back)
    pool.close()
    pool.join()
    index = 1
    while qid_queue.qsize() > 0:
        if not isinstance(qid_queue.get_nowait(), int) and index < 30:
            csv_file = open('travco_return_rule.csv', 'a', encoding='utf8')
            writer = csv.writer(csv_file)
            writer.writerow([qid_queue.get()])
            csv_file.close()
Example #45
        train_ind_dict[i] = 1
        valid_ind_dict[j] = 1
        test_ind_dict[k] = 1
    with open(file_name,'r') as ref:
        for ind,line in enumerate(ref):
            tmpp = line.strip().split(',')[0]
            if ind in train_ind_dict.keys():
                train_set.write(tmpp+'\n')
            elif ind in valid_ind_dict.keys():
                valid_set.write(tmpp+'\n')
            elif ind in test_ind_dict.keys():
                test_set.write(tmpp+'\n')
    train_set.close()
    valid_set.close()
    test_set.close()
    
if __name__=='__main__':
    print(file_path,protein,data_directory)
    try:
        os.mkdir(file_path+'/'+protein+"/iteration_"+str(n_it))
    except:
        pass
    f_names = []
    for f in glob.glob(data_directory+'/*.txt'):
        f_names.append(f)

    t=time.time()
    with closing(Pool(np.min([tot_process,len(f_names)]))) as pool:
        pool.map(train_valid_test,f_names)
    print(time.time()-t)
Example #46
if __name__ == '__main__':
    argss = [
        {   
            'prefix': 'hist_',
            'key': 'card_id',
            'num_aggregations': {
                'category_2_1': stats,
                'category_2_2': stats, 
                'category_2_3': stats,
                'category_2_4': stats, 
                'category_2_5': stats, 
                'category_3_0': stats,
                'category_3_1': stats,
                'category_3_2': stats, 
            }
        }
    ]

    pool = Pool(NTHREAD)
    callback = pool.map(aggregate, argss)
    pool.close()

#==============================================================================
utils.end(__file__)






def PTRC(init_code, p_error, p_sampling=None, droplets=4, Nc=None, steps=20000, conv_mult=2.0):
    p_sampling = p_sampling or p_error
    iters = 10

    if type(init_code) == list:
        # this is either 4 or 16, depending on what type of code is used.
        nbr_eq_classes = init_code[0].nbr_eq_classes
        # make sure one init code is provided for each class
        assert len(init_code) == nbr_eq_classes, 'if init_code is a list, it has to contain one code for each class'
        # store system_size for brevity
        size = init_code[0].system_size
        # if Nc is not provided, use code system_size
        Nc = Nc or size
        # initiate class ladders
        eq_ladders = [Ladder(p_sampling, eq_code, Nc) for eq_code in init_code]

    else:
        # this is either 4 or 16, depending on what type of code is used.
        nbr_eq_classes = init_code.nbr_eq_classes
        # store system_size for brevity
        size = init_code.system_size
        # if Nc is not provided, use code system_size
        Nc = Nc or size
        # convert init_code to every class and initiate ladders
        eq_ladders = [None] * nbr_eq_classes
        for eq in range(nbr_eq_classes):
            eq_code = copy.deepcopy(init_code)
            eq_code.qubit_matrix = eq_code.to_class(eq)
            eq_ladders[eq] = Ladder(p_sampling, eq_code, Nc)

    # reduce number of steps to account for parallel markov chains
    steps = steps // Nc
    # this is where we save all samples in a dict, to find the unique ones.
    qubitlist = {}
    # keep track of convergence
    conv_step = np.zeros(nbr_eq_classes)
    # keep track of shortest observed chains
    shortest = np.ones(nbr_eq_classes) * (2 * size ** 2)
    # keep track of when to stop if using convergence criteria
    stop = steps
    # inverse temperature when writing probability in exponential form
    beta_error = -log((p_error / 3) / (1 - p_error))
    # array of betas corresponding to ladder temperatures
    beta_ladder = -np.log((eq_ladders[0].p_ladder[:-1] / 3) / (1 - eq_ladders[0].p_ladder[:-1]))
    d_beta = beta_ladder - beta_error
    # Array to hold the boltzmann factors for every class
    Z_arr = np.zeros(nbr_eq_classes)

    # initiate worker pool
    if droplets > 1:
        pool = Pool(droplets)

    # do mcmc sampling, one class at a time
    for eq in range(nbr_eq_classes):

        if droplets == 1:
            unique_lengths_ladder, len_counts_ladder = PTRC_droplet(eq_ladders[eq], steps, iters, conv_mult)
        else:
            # ladder of lengths of qubit matrices
            unique_lengths_ladder = [{} for _ in range(Nc)]
            # ladder of observations of qubit matrix lengths
            len_counts_ladder = [{} for _ in range(Nc)]

            args = [(copy.deepcopy(eq_ladders[eq]), steps, iters, conv_mult) for _ in range(droplets)]
            output = pool.starmap_async(PTRC_droplet, args).get()

            # combine outputs
            for res in output:
                # iterate ladder
                for i in range(Nc):
                    unique_lengths_ladder[i].update(res[0][i])
                    # iterate output unique_lengths dictionary
                    for length, counts in res[1][i].items():
                        # aggregate or init length counter
                        if length in len_counts_ladder[i]:
                            # aggregate N(n)
                            len_counts_ladder[i][length][0] += counts[0]
                            # aggregate m(n)
                            len_counts_ladder[i][length][1] += counts[1]
                        else:
                            len_counts_ladder[i][length] = counts
        
        # iterate through all but top chain in ladder
        for i in range(Nc - 1):
            # sort len_counts by length
            sorted_counts = sorted(len_counts_ladder[i].items(), key=itemgetter(0))
            # make length and count array from sorted list
            lengths, counts = [np.array(lst) for lst in zip(*sorted_counts)]
            
            ## calculate C estimate for each length, count pair
            #C_ests = counts[:, 0] / counts[:, 1] * np.exp(-beta_ladder[i] * (lengths - lengths[0]))
            ## remove outlier estimates
            #tmp = C_ests[C_ests * 2 > C_ests[0]]
            ## calculate final estimate
            #C_mean = np.sqrt(np.mean(np.square(tmp))) # Root mean square so the average is "top-weighted"
            
            C_mean = np.mean(counts[:2, 0] / counts[:2, 1] * np.exp(-beta_ladder[i] * (lengths[:2] - lengths[0])))

            # calculate boltzmann factor from C estimate
            Z_est = C_mean * (counts[:, 1] * np.exp(lengths * d_beta[i] - beta_ladder[i] * lengths[0])).sum()
            # Accumulate boltzmann factor for equivalence class
            Z_arr[eq] += Z_est

    # Return normalized eq_distr
    return (Z_arr / np.sum(Z_arr) * 100).astype(np.uint8)#, conv_step
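# Illustrative sketch (not part of the original): the droplet pattern used above,
# reduced to a self-contained toy. toy_droplet stands in for PTRC_droplet; every
# droplet returns a Counter of observations, and the results of starmap_async are
# merged, mirroring how the per-droplet length counts are combined above.
from multiprocessing import Pool
from collections import Counter

def toy_droplet(seed, steps):
    # pretend each droplet samples `steps` observations derived from its seed
    return Counter((seed * k) % 7 for k in range(steps))

if __name__ == '__main__':
    droplets = 4
    merged = Counter()
    with Pool(droplets) as pool:
        args = [(seed, 1000) for seed in range(droplets)]
        for res in pool.starmap_async(toy_droplet, args).get():
            merged.update(res)              # aggregate counts across droplets
    print(dict(merged))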
Beispiel #48
0
        'sun3d-mit_76_studyroom-76-1studyroom2',
        'sun3d-mit_lab_hj-lab_hj_tea_nov_2_2012_scan1_erika'
    ]
    # will evaluate the descriptor in `{desc_name}_{timestr}` folder.
    desc_name = sys.argv[1]
    timestr = sys.argv[2]
    # inlier_ratio = float(sys.argv[3])
    # distance_threshold = float(sys.argv[4])
    inlier_ratio = 0.05  # 5%
    distance_threshold = 0.10  # 10cm

    # multiprocessing to register each pair in each scene.
    # this part is time-consuming
    from multiprocessing import Pool

    pool = Pool(len(scene_list))
    func = partial(deal_with_one_scene, inlier_ratio, distance_threshold)
    pool.map(func, scene_list)
    pool.close()
    pool.join()

    # collect all the data and print the results.
    inliers_list = []
    recall_list = []
    inliers_ratio_list = []
    pred_match = 0
    gt_match = 0
    for scene in scene_list:
        # evaluate
        pcdpath = f"../data/3DMatch/fragments/{scene}/"
        resultpath = f"pred_result/{scene}/{desc_name}_result_{timestr}"
Beispiel #49
0
def read(path) -> DataFrame:
    def _clean(row):
        text = URL_REGEX.sub('', row.contents)
        if row.is_forward and '//@' in text:
            # this is a forward and the format is well-formed
            if text.startswith('//@'):
                # a pure forward: fall back to the original weibo's content
                try:
                    text = FORWARD_CONTENT.findall(text)[-1]
                    i = FORWARD_SPLIT.match(text).regs[0][1]
                    text = text[i:]
                except IndexError:
                    text = text.replace('//@', '')  # TODO: could be handled via the weibo API
            else:
                # otherwise keep only the newly added content
                text = text[:text.find('//@')]
        return text

    temp_name = os.path.basename(path).replace('.xlsx', '')
    if os.path.isfile(cache_path(temp_name)):
        data, texts = load_cache(temp_name)
    else:
        output(f"===> Reading from <{path}>.")
        data: DataFrame = read_excel(path)  # .iloc[:280]

        # keep only the 4 columns we need, drop missing values, and extract the date
        data = data[['contents', 'time', 'id', 'is_forward']].dropna().reset_index()
        data['date'] = data['time'].apply(lambda s: s[:10])
        data['contents'] = data['contents'].astype(str)

        # preprocess the texts
        texts = data.apply(_clean, axis=1).to_list()
        dump_cache((data, texts), temp_name)
    output(f"===> got {len(data)} rows from <{path}>.")

    # parse the GPU IDs
    ltp_ids = [i.strip() for i in _ARGS.ltpIDS.split(',')]
    skep_ids = [i.strip() for i in _ARGS.skepIDS.split(',')]

    # initialize the process pool, manager and data queues
    pool = Pool(1 + len(ltp_ids) + len(skep_ids))  # tokenization, skep input extraction and skep computation, respectively
    manager = Manager()
    feature_queue = manager.Queue(16 * len(skep_ids))
    result_queue = manager.Queue(16 * len(skep_ids))

    # launch the asynchronous tasks
    pool.apply_async(skep_producer, (feature_queue, texts, 16, len(skep_ids)))
    tokens = dict()
    for i, (s, p) in zip(ltp_ids, generate_batch(texts, len(texts) // len(ltp_ids) + 1)):
        tokens[(s.start, s.stop)] = pool.apply_async(ltp_tokenzier, (p, 192, i))
    for i in skep_ids:
        pool.apply_async(skep_consumer, (feature_queue, result_queue, i))

    # collect the results
    scores, counter = zeros(len(texts)), 1
    while True:
        _slice, array = result_queue.get()
        # print(_slice)
        if array is None:
            if counter < len(skep_ids):
                counter += 1
            else:
                break
        else:
            scores[_slice] = array

    data['tokens'] = None
    for s, t in tokens.items():
        data['tokens'].update(Series(t.get(), range(*s)))
    data['sentiment_score'] = scores
    pool.close()
    pool.join()
    return data[['date', 'tokens', 'id', 'sentiment_score']]
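# Illustrative sketch (not from the original): the producer/consumer layout used in
# read() above, reduced to a self-contained toy. One producer fills a Manager queue,
# several consumers drain it and push results to a second queue, and None sentinels
# mark the end of the stream, mirroring how skep_consumer results are collected above.
from multiprocessing import Manager, Pool

def producer(in_queue, items, n_consumers):
    for i, item in enumerate(items):
        in_queue.put((i, item))
    for _ in range(n_consumers):
        in_queue.put((None, None))           # one sentinel per consumer

def consumer(in_queue, out_queue):
    while True:
        i, item = in_queue.get()
        if item is None:
            out_queue.put((None, None))      # forward the sentinel to the collector
            return
        out_queue.put((i, item * item))      # stand-in for the real scoring work

if __name__ == '__main__':
    n_consumers = 2
    manager = Manager()
    in_q, out_q = manager.Queue(), manager.Queue()
    with Pool(1 + n_consumers) as pool:
        pool.apply_async(producer, (in_q, list(range(10)), n_consumers))
        for _ in range(n_consumers):
            pool.apply_async(consumer, (in_q, out_q))

        results, finished = {}, 0
        while finished < n_consumers:
            i, val = out_q.get()
            if val is None:
                finished += 1
            else:
                results[i] = val
    print([results[i] for i in sorted(results)])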
def STRC(init_code, p_error, p_sampling=None, droplets=10, steps=20000, conv_mult=0):
    # set p_sampling equal to p_error by default
    p_sampling = p_sampling or p_error

    if type(init_code) == list:
        # this is either 4 or 16, depending on what type of code is used.
        nbr_eq_classes = init_code[0].nbr_eq_classes
        # make sure one init code is provided for each class
        assert len(init_code) == nbr_eq_classes, 'if init_code is a list, it has to contain one code for each class'
        # Create chains with p_sampling, this is allowed since N(n) is independent of p.
        eq_chains = [Chain(p_sampling, copy.deepcopy(code)) for code in init_code]
        # don't apply uniform stabilizers if low energy inits are provided
        randomize = False

    else:
        # this is either 4 or 16, depending on what type of code is used.
        nbr_eq_classes = init_code.nbr_eq_classes
        # Create chains with p_sampling, this is allowed since N(n) is independent of p.
        eq_chains = [None] * nbr_eq_classes
        for eq in range(nbr_eq_classes):
            eq_chains[eq] = Chain(p_sampling, copy.deepcopy(init_code))
            eq_chains[eq].code.qubit_matrix = eq_chains[eq].code.to_class(eq)
        # apply uniform stabilizers, i.e. rain
        randomize = True

    # error model
    beta_error = -log((p_error / 3) / (1 - p_error))
    beta_sampling = -log((p_sampling / 3) / (1 - p_sampling))
    d_beta = beta_sampling - beta_error

    # Array to hold the boltzmann factors for every class
    Z_arr = np.zeros(nbr_eq_classes)

    # Largest possible chain length
    max_length = 2 * eq_chains[0].code.system_size ** 2

    if droplets > 1:
        pool = Pool(droplets)

    # Iterate through equivalence classes
    for eq in range(nbr_eq_classes):
        chain = eq_chains[eq]

        # Start parallel processes with droplets.
        if droplets == 1:
            unique_lengths, len_counts, short_unique = STRC_droplet(copy.deepcopy(chain), steps, max_length, eq, randomize, conv_mult)
            shortest = next(iter(short_unique[0].values()))
            next_shortest = next(iter(short_unique[1].values()))
        else:
            args = [(copy.deepcopy(chain), steps, max_length, eq, randomize, conv_mult) for _ in range(droplets)]
            output = pool.starmap_async(STRC_droplet, args).get()

            # We need to combine the results from all raindrops
            unique_lengths = {}
            len_counts = {}
            short_unique = [{} for _ in range(2)]

            shortest = max_length
            next_shortest = max_length

            # Find shortest and next shortest length found by any chain
            for i in range(droplets):
                _,_,data = output[i]
                if next(iter(data[0].values())) < shortest:
                    next_shortest = shortest
                    shortest = next(iter(data[0].values()))
                if next(iter(data[1].values())) < next_shortest:
                    next_shortest = next(iter(data[1].values()))
            
            # Add data from each droplet to the combined dataset
            for i in range(droplets):
                # Unpack results
                unique_lengths_i, len_counts_i, short_unique_i = output[i]
                
                # Combine unique lengths ( not really needed? )
                unique_lengths.update(unique_lengths_i)

                # Combine len_counts
                for key in len_counts_i:
                    if key in len_counts:
                        len_counts[key] += len_counts_i[key]
                    else:
                        len_counts[key] = len_counts_i[key]
                
                # Combine the sets of shortest and next shortest chains
                shortest_i = next(iter(short_unique_i[0].values()))
                next_shortest_i = next(iter(short_unique_i[1].values()))

                if shortest_i == shortest:
                    short_unique[0].update(short_unique_i[0])
                if shortest_i == next_shortest:
                    short_unique[1].update(short_unique_i[0])
                if next_shortest_i == next_shortest:
                    short_unique[1].update(short_unique_i[1])

        # Partial result needed for boltzmann factor
        shortest_count = len(short_unique[0])
        shortest_fraction = shortest_count / len_counts[shortest]

        next_shortest_count = len(short_unique[1])
        
        # Handle rare cases where only one chain length is observed
        if next_shortest != max_length:
            next_shortest_fraction = next_shortest_count / len_counts[next_shortest]
            mean_fraction = 0.5 * (shortest_fraction + next_shortest_fraction * exp(-beta_sampling * (next_shortest - shortest)))
        
        else:
            mean_fraction = shortest_fraction

        # Calculate boltzmann factor from observed chain lengths
        Z_e = sum([m * exp(-beta_sampling * shortest + d_beta * l) for l, m in len_counts.items()]) * mean_fraction
        Z_arr[eq] = Z_e

    # Use boltzmann factors as relative probabilities and normalize distribution
    return (Z_arr / np.sum(Z_arr) * 100)
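# For intuition (hypothetical numbers, not from an actual run): the final line turns the
# accumulated per-class Boltzmann factors into a percentage distribution, e.g.
import numpy as np
Z_demo = np.array([8.0, 1.0, 0.5, 0.5])     # assumed Boltzmann factors for 4 classes
print(Z_demo / np.sum(Z_demo) * 100)        # -> [80. 10.  5.  5.]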
    def map_all_address_to_exchange(self):
        pool = Pool(WORKERS)#, self.init_worker)

        pool.map(self.map_address_to_exchange, exchangesPages)
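# Illustrative sketch (assumed, not from the original): in Python 3, Pool.map accepts a
# bound method such as self.map_address_to_exchange above, provided the instance itself
# is picklable. Scraper, handle_page and the page list here are hypothetical stand-ins.
from multiprocessing import Pool

class Scraper:
    def __init__(self, tag):
        self.tag = tag                       # plain, picklable state

    def handle_page(self, page):
        return '%s:%s' % (self.tag, page)

    def handle_all(self, pages, workers=4):
        with Pool(workers) as pool:
            return pool.map(self.handle_page, pages)

if __name__ == '__main__':
    print(Scraper('eth').handle_all(['page1', 'page2', 'page3']))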
def PTDC(init_code, p_error, p_sampling=None, droplets=4, Nc=None, steps=20000, conv_mult=0):
    p_sampling = p_sampling or p_error
    iters = 10

    if type(init_code) == list:
        # this is either 4 or 16, depending on what type of code is used.
        nbr_eq_classes = init_code[0].nbr_eq_classes
        # make sure one init code is provided for each class
        assert len(init_code) == nbr_eq_classes, 'if init_code is a list, it has to contain one code for each class'
        # store system_size for brevity
        size = init_code[0].system_size
        # if Nc is not provided, use code system_size
        Nc = Nc or size
        # initiate class ladders
        eq_ladders = [Ladder(p_sampling, eq_code, Nc) for eq_code in init_code]

    else:
        # this is either 4 or 16, depending on what type of code is used.
        nbr_eq_classes = init_code.nbr_eq_classes
        # store system_size for brevity
        size = init_code.system_size
        # if Nc is not provided, use code system_size
        Nc = Nc or size
        # convert init_code to every class and initiate ladders
        eq_ladders = [None] * nbr_eq_classes
        for eq in range(nbr_eq_classes):
            eq_code = copy.deepcopy(init_code)
            eq_code.qubit_matrix = eq_code.to_class(eq)
            eq_ladders[eq] = Ladder(p_sampling, eq_code, Nc)
        
    # reduce number of steps to account for parallel markov chains
    steps = steps // Nc
    # this is where we save all samples in a dict, to find the unique ones.
    qubitlist = {}
    # Z_E will be saved in eqdistr
    eqdistr = np.zeros(nbr_eq_classes)
    # keep track of convergence
    conv_step = np.zeros(nbr_eq_classes)
    # keep track of shortest observed chains
    shortest = np.ones(nbr_eq_classes) * (2 * size ** 2)
    # keep track of when to stop if using convergence criteria
    stop = steps
    # error-model
    beta = -log((p_error / 3) / (1 - p_error))

    # initiate worker pool
    if droplets > 1:
        pool = Pool(droplets)

    # Do mcmc sampling, one class at a time
    for eq in range(nbr_eq_classes):
        if droplets == 1:
            qubitlist = PTDC_droplet(eq_ladders[eq], steps, iters, conv_mult)
        else:
            args = [(copy.deepcopy(eq_ladders[eq]), steps, iters, conv_mult) for _ in range(droplets)]
            output = pool.starmap_async(PTDC_droplet, args).get()
            for res in output:
                qubitlist.update(res)

        # mcmc sampling for class is finished. calculate boltzmann factor
        for key in qubitlist:
            eqdistr[eq] += exp(-beta * qubitlist[key])
        qubitlist.clear()

    # Return normalized eq_distr
    return (np.divide(eqdistr, sum(eqdistr)) * 100).astype(np.uint8)#, conv_step
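# For intuition (hypothetical numbers, not from an actual run): with p_error = 0.1 the
# inverse temperature is beta = -log((0.1/3)/0.9) = log(27) ≈ 3.30, so every unique chain
# of length n adds exp(-beta * n) to its class in eqdistr; longer chains are suppressed fast.
from math import exp, log
p_demo = 0.1
beta_demo = -log((p_demo / 3) / (1 - p_demo))     # = log(27) ≈ 3.2958
print(exp(-beta_demo * 4), exp(-beta_demo * 6))   # relative weights of length-4 and length-6 chains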
def build_save_dataset(corpus_type, fields, src_reader, cue_reader, tgt_reader,
                       opt):
    assert corpus_type in ['train', 'valid']

    if corpus_type == 'train':
        counters = defaultdict(Counter)
        srcs = opt.train_src
        cues = opt.train_cue
        tgts = opt.train_tgt
        ids = opt.train_ids
    elif corpus_type == 'valid':
        counters = None
        srcs = [opt.valid_src]
        cues = [opt.valid_cue]
        tgts = [opt.valid_tgt]
        ids = [None]

    src_vocab, tgt_vocab, existing_fields = maybe_load_vocab(
        corpus_type, counters, opt)

    existing_shards = check_existing_pt_files(opt, corpus_type, ids,
                                              existing_fields)

    # every corpus has shards, no new one
    if existing_shards == ids and not opt.overwrite:
        return

    def shard_iterator(srcs, cues, tgts, ids, existing_shards, existing_fields,
                       corpus_type, opt):
        """
        Builds a single iterator yielding every shard of every corpus.
        """
        for src, tgt, cue, maybe_id in zip(srcs, tgts, cues, ids):
            if maybe_id in existing_shards:
                if opt.overwrite:
                    logger.warning(
                        "Overwrite shards for corpus {}".format(maybe_id))
                else:
                    if corpus_type == "train":
                        assert existing_fields is not None,\
                            ("A 'vocab.pt' file should be passed to "
                             "`-src_vocab` when adding a corpus to "
                             "a set of already existing shards.")
                    logger.warning("Ignore corpus {} because "
                                   "shards already exist".format(maybe_id))
                    continue
            if ((corpus_type == "train" or opt.filter_valid)
                    and tgt is not None):
                filter_pred = partial(inputters.filter_example,
                                      use_src_len=opt.data_type == "text",
                                      max_src_len=opt.src_seq_length,
                                      max_tgt_len=opt.tgt_seq_length)
            else:
                filter_pred = None
            src_shards = split_corpus(src, opt.shard_size)
            cue_shards = split_corpus(cue, opt.shard_size)
            tgt_shards = split_corpus(tgt, opt.shard_size)
            for i, (ss, cs,
                    ts) in enumerate(zip(src_shards, cue_shards, tgt_shards)):
                yield (i, (ss, cs, ts, maybe_id, filter_pred))

    shard_iter = shard_iterator(srcs, cues, tgts, ids, existing_shards,
                                existing_fields, corpus_type, opt)

    with Pool(opt.num_threads) as p:
        dataset_params = (corpus_type, fields, src_reader, cue_reader,
                          tgt_reader, opt, existing_fields, src_vocab,
                          tgt_vocab)
        func = partial(process_one_shard, dataset_params)
        for sub_counter in p.imap(func, shard_iter):
            if sub_counter is not None:
                for key, value in sub_counter.items():
                    counters[key].update(value)

    if corpus_type == "train":
        vocab_path = opt.save_data + '.vocab.pt'
        if existing_fields is None:
            fields = _build_fields_vocab(
                fields, counters, opt.data_type, opt.share_vocab,
                opt.vocab_size_multiple, opt.src_vocab_size,
                opt.src_words_min_frequency, opt.cue_vocab_size,
                opt.cue_words_min_frequency, opt.tgt_vocab_size,
                opt.tgt_words_min_frequency)
        else:
            fields = existing_fields
        torch.save(fields, vocab_path)
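# Illustrative sketch (not part of the original): the shard pattern used above, reduced
# to a toy -- a fixed leading argument is bound with functools.partial, shards are
# streamed through Pool.imap, and the per-shard Counters are merged exactly like the
# sub_counter loop above. All names here are made up.
from multiprocessing import Pool
from functools import partial
from collections import Counter, defaultdict

def count_tokens(lowercase, shard):
    # stand-in for process_one_shard: count tokens in one shard
    return Counter(t.lower() if lowercase else t for t in shard)

if __name__ == '__main__':
    shards = [['a', 'b', 'A'], ['b', 'c'], ['C', 'c']]
    counters = defaultdict(Counter)
    func = partial(count_tokens, True)
    with Pool(2) as p:
        for sub_counter in p.imap(func, shards):
            counters['src'].update(sub_counter)
    print(dict(counters['src']))            # {'a': 2, 'b': 2, 'c': 3}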
Beispiel #54
0
    def run_program(self, dataset, parameters):
        self.logger.info("Starting run\nParameters:\n{}".format("\n".join(
            ["\t{}: {}".format(k, v) for k, v in parameters.items()])))
        self.logger.info(
            "Distributing load over {} cores".format(NUM_OF_WORKERS))

        kg_i, kg_s = dataset

        # fit model
        t0 = timer()

        # MP manager
        manager = Manager()

        # generate semantic item sets from sampled graph
        si_sets = manager.dict(generate_semantic_item_sets(kg_i))

        # generate common behaviour sets
        work = manager.Queue()
        keys = list(si_sets.keys())
        slices = self.diagonal_matrix_slicer(keys)

        cbs_sets = manager.list()
        pool = []
        for i in range(NUM_OF_WORKERS):
            p = Process(target=generate_common_behaviour_sets,
                        args=(si_sets, cbs_sets, work,
                              parameters["similarity_threshold"]))
            p.daemon = True
            p.start()
            pool.append(p)

        for slce in slices:
            work.put(slce)

        for p in pool:
            work.put(None)

        # join shared variables
        for p in pool:
            p.join()

        # extend common behaviour sets
        cbs_size = 2
        cbs_sets_extended = manager.list(cbs_sets)
        while cbs_size < parameters["max_cbs_size"]:
            func = partial(extend_common_behaviour_sets, cbs_sets_extended,
                           parameters["similarity_threshold"])

            slices = self.diagonal_matrix_slicer(cbs_sets_extended)
            cbs_sets_extention = manager.list()
            with Pool(processes=NUM_OF_WORKERS) as pool:
                it = pool.imap_unordered(func=func, iterable=slices)

                while True:
                    try:
                        cbs_subset = next(it)
                        cbs_sets_extention.extend(cbs_subset)
                    except StopIteration:
                        break

            cbs_sets.extend(cbs_sets_extention)
            cbs_sets_extended = cbs_sets_extention
            cbs_size *= 2

        # generate semantic item sets from sampled graph association rules
        rules = manager.list()
        work = manager.Queue()
        size = max(1, floor(len(cbs_sets) / NUM_OF_WORKERS))
        slices = [slice(i, i + size) for i in range(0, len(cbs_sets), size)]

        pool = []
        for i in range(NUM_OF_WORKERS):
            p = Process(target=generate_semantic_association_rules,
                        args=(kg_i, kg_s, cbs_sets, work, rules,
                              parameters["minimal_local_support"]))
            p.daemon = True
            p.start()
            pool.append(p)

        for slce in slices:
            work.put(slce)

        for p in pool:
            work.put(None)

        # join shared variables
        for p in pool:
            p.join()

        # calculate support and confidence, skip those not meeting minimum requirements
        final_rule_set = manager.list()
        work = manager.Queue()
        size = max(1, floor(len(rules) / NUM_OF_WORKERS))
        slices = [slice(i, i + size) for i in range(0, len(rules), size)]

        pool = []
        for i in range(NUM_OF_WORKERS):
            p = Process(target=evaluate_rules,
                        args=(kg_i, rules, work, final_rule_set,
                              parameters["minimal_support"],
                              parameters["minimal_confidence"]))

            p.daemon = True
            p.start()
            pool.append(p)

        for slce in slices:
            work.put(slce)

        for p in pool:
            work.put(None)

        # join shared variables
        for p in pool:
            p.join()

        # sorting rules on both support and confidence
        final_rule_set.sort(key=itemgetter(2, 1), reverse=True)

        # time took
        t1 = timer()
        dt = t1 - t0
        print("  Program completed in {:.3f} ms".format(dt))

        print("  Found {} rules".format(len(final_rule_set)))
        return final_rule_set
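# Illustrative sketch (not from the original): the worker layout used in run_program
# above, reduced to a toy -- daemon Processes pull slice objects from a Manager queue,
# append results to a shared Manager list, and exit when they receive a None sentinel.
from multiprocessing import Manager, Process

def square_worker(data, work, results):
    while True:
        slce = work.get()
        if slce is None:
            return                          # sentinel: no more work
        results.extend([x * x for x in data[slce]])

if __name__ == '__main__':
    n_workers = 4
    manager = Manager()
    work = manager.Queue()
    results = manager.list()
    data = list(range(100))

    pool = []
    for _ in range(n_workers):
        p = Process(target=square_worker, args=(data, work, results))
        p.daemon = True
        p.start()
        pool.append(p)

    for i in range(0, len(data), 25):
        work.put(slice(i, i + 25))
    for _ in range(n_workers):
        work.put(None)                      # one sentinel per worker

    for p in pool:
        p.join()

    squares = results[:]                    # copy out of the manager proxy
    print(len(squares), sum(squares))       # 100 328350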
Beispiel #55
0
    patterns = [
        ''.join([random.choice("CD") for _ in lookup_table_keys])
        for i in range(number)
    ]

    # zip together the keys and the patterns to give a table
    tables = [dict(zip(lookup_table_keys, pattern)) for pattern in patterns]

    return tables


if __name__ == '__main__':
    arguments = docopt(__doc__, version='Lookup Evolver 0.1')

    # set up the process pool
    pool = Pool(processes=int(arguments['-i']))

    # vars for the genetic algorithm
    starting_pop = int(arguments['-k'])
    mutation_rate = float(arguments['-u'])
    generations = int(arguments['-g'])
    bottleneck = int(arguments['-b'])
    plys = int(arguments['-p'])
    start_plys = int(arguments['-s'])

    # generate a starting population of tables and score them
    # these will start off the first generation
    starting_tables = get_random_tables(plys, start_plys, starting_pop)
    real_starting_tables = axelrod_utils.score_tables(starting_tables, pool)

    # kick off the evolve function
Beispiel #56
0
            'score': item[5] + item[6]
        }
        '''
        print '\n'
        for i in item:
            print i
        '''


def write_to_file(content):
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main(offset):
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    for item in parse_one_page(html):
        #print('writing'+item+'\n\n')
        write_to_file(item)
        #for i in item:
        #print i,':',item[i].strip()


if __name__ == '__main__':
    #for i in range(10):
    #    main(i*10)
    pool = Pool()
    pool.map(main, [i * 10 for i in range(10)])
def pool_handler():
    p = Pool(2)
    p.map(work_log, work)
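# Illustrative sketch (assumed, not from the original): work_log and work are not shown
# above, so this is one self-contained way the same two-process Pool.map layout might
# look; the task names and sleep times are made up.
from multiprocessing import Pool
import time

def work_log(task):
    name, seconds = task
    print('Process %s waiting %s seconds' % (name, seconds))
    time.sleep(seconds)
    print('Process %s finished' % name)

work = [('A', 2), ('B', 1), ('C', 3), ('D', 2)]

def pool_handler_demo():
    p = Pool(2)
    p.map(work_log, work)                    # blocks until all four tasks have run
    p.close()
    p.join()

if __name__ == '__main__':
    pool_handler_demo()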
        monconn_jobs_local_cur = monconn_jobs_local.getCursor()
        monconn_jobs_local.dropTable()
        print 'Connecting to Mongodb...finished'
        del(monconn_jobs_local)
        
        

        
        #########################################################################################################             
        ############----------------- Initiating Multiprocessing and extracting Jobs
        ############----------------- Set flag pprocessing = 1 for multiprocessing (avoid)
        #########################################################################################################                
        numChunks = 100
        chunkIDs = range(0, numChunks)   
        print chunkIDs
        
        pprocessing = 0
        
        if pprocessing == 0:
            preProcessChunk(1)
            #for chunkID in chunkIDs:
            #    preProcessChunk(chunkID)
            pass
        
        else:
            numConcurrentThreads = 5
            pool = Pool(numConcurrentThreads)
            pool.map(preProcessChunk, chunkIDs)
        
    except:
       send_email(['*****@*****.**', '*****@*****.**','*****@*****.**'],"Midout Mailers -Urgent!!!","Jobs Processing from SQL Failed!!!!!\nCall Akash (+91-8527716555) or Kanika (+91-9560649296) asap.")
Beispiel #59
0
def main():
    with Pool(processes=1, maxtasksperchild=2) as p:
        print(p.starmap_async(myTask, [(4, 3), (2, 1), (3, 2), (5, 1)]).get())
        print(p.starmap_async(myTask, [(4, 3), (2, 1), (3, 2), (2, 3)]).get())
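# Note (illustrative, not from the original): maxtasksperchild=2 means each worker is
# retired and replaced after finishing two tasks, so long-lived leaks cannot accumulate.
# A quick way to observe the recycling with a single worker (show_pid is made up):
import os
from multiprocessing import Pool

def show_pid(x, y):
    return (os.getpid(), x + y)

if __name__ == '__main__':
    with Pool(processes=1, maxtasksperchild=2) as p:
        # the reported PID typically changes after every second task
        print(p.starmap_async(show_pid, [(1, 1), (2, 2), (3, 3), (4, 4)]).get())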
Beispiel #60
0
p1.join()
p2.join()
print(p1)
print(p2)

print("Done")


#using map reduce
from multiprocessing import Pool


def f(n):
    return n*n

p = Pool(processes=3)
result = p.map(f,[1,2,3,4,5])
for n in result:
    print(n)

#using multithread
import threading
t=time.time()

t1=threading.Thread(target=calculate_square, args=(arrs,))#just like the others target=function, args=arguments of function
t2=threading.Thread(target=calculate_cube, args=(arrs,))#same thing but for cube

t1.start()#start thread 1
t2.start()#start thread 2

t1.join()#wait for thread 1 to finish