Example #1
def process_articles(entity_type=Entity, output_filename='output-all.txt',
                     corpus_root='corpus/'):
    terms = select_terms(entity_type)
    
    Session.expunge_all()
    Session.close()
    
    articles = Session.query(Entity.sep_dir).filter(Entity.sep_dir!=None)
    articles = articles.filter(Entity.sep_dir!='')
    articles = articles.distinct().all()
    articles = [a[0] for a in articles]
   
    # parallel processing of articles
    p = Pool()
    args = [(title, terms, entity_type, None, corpus_root) for title in articles]
    doc_lines = p.map(process_wrapper, args)
    p.close()

    #serial processing for tests
    '''
    doc_lines = []
    for title in articles:
        lines = process_article(title, terms, entity_type, None, corpus_root)
        doc_lines.append(lines)
    '''

    # write graph output to file
    print output_filename
    with open(output_filename, 'w') as f:
        for lines in doc_lines:
            f.writelines(lines)
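Example #1 relies on a process_wrapper helper that is not shown; a minimal sketch of what it presumably does (the name and argument order come from the call above, the body is an assumption), unpacking the single tuple that Pool.map passes to each worker:

def process_wrapper(args):
    # Hypothetical: Pool.map delivers one argument per call, so unpack the
    # tuple and forward it to the real per-article worker.
    title, terms, entity_type, extra, corpus_root = args
    return process_article(title, terms, entity_type, extra, corpus_root)
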
Example #2
def main():
    parser = argparse.ArgumentParser(description='Analyze a bandersnatch mirror.')
    parser.add_argument('--json',
                       help='save raw data to a json file',
                       default=None)
    args = parser.parse_args()
    concurrency = 8
    root = "/var/spool/pypi/web/packages/source/"
    p = Pool()
    results = {}
    try:
        try:
            for path, result in \
                p.imap_unordered(analyse_sdist, yield_packages(root)):
                results[path] = result
            p.close()
        except:
            p.terminate()
            raise
    finally:
        p.join()
    if args.json:
        with open(args.json, 'wb') as f:
            f.write(json.dumps(results))
    pprint.pprint(results)
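The yield_packages generator is not shown in Example #2; a hedged sketch of what such a generator might look like (the file extensions and mirror layout are assumptions), yielding sdist paths lazily so imap_unordered can stream work to the pool:

import os

def yield_packages(root):
    # Hypothetical helper: walk the mirror and yield archive paths one by
    # one instead of building the full listing up front.
    for dirpath, _dirnames, filenames in os.walk(root):
        for name in filenames:
            if name.endswith(('.tar.gz', '.zip', '.tar.bz2')):
                yield os.path.join(dirpath, name)
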
Example #3
class withPool:
    def __init__(self, procs):
        self.p = Pool(procs, init_func)
    def __enter__(self):
        return self
    def __exit__(self, exc_type, exc_val, exc_tb):
        self.p.close()
def enumerate_all_subgraphs_upto_size_k_parallel(document_graph, k, num_of_workers=4):
    """
    returns all subgraphs of a DiscourseDocumentGraph (i.e. a MultiDiGraph)
    with up to k nodes. This is a trivially parallelized version of
    enumerate_all_subgraphs_upto_size_k()
    """
    document_nodes = len(document_graph)
    if k > document_nodes:
        k = document_nodes

    int_graph = nx.convert_node_labels_to_integers(nx.DiGraph(document_graph),
                                                   first_label=1,
                                                   label_attribute='node_id')

    pool = Pool(processes=num_of_workers) # number of CPUs
    results = [pool.apply_async(enumerate_all_size_k_subgraphs, args=(int_graph, i))
                for i in xrange(1, k+1)]
    pool.close()
    pool.join()

    subgraphs = []
    for result in results:
        tmp_result = result.get()
        if isinstance(tmp_result, list):
            subgraphs.extend(tmp_result)
        else:
            subgraphs.append(tmp_result)
    return subgraphs
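The pattern in Example #3 is to collect the AsyncResult objects first and call .get() only after close()/join(); a minimal self-contained sketch of that same pattern (the worker here is purely illustrative):

from multiprocessing import Pool

def square(n):
    return n * n

if __name__ == '__main__':
    pool = Pool(processes=4)
    results = [pool.apply_async(square, args=(i,)) for i in range(1, 6)]
    pool.close()
    pool.join()
    # .get() returns the worker's return value (and re-raises its exceptions)
    print([r.get() for r in results])  # [1, 4, 9, 16, 25]
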
Example #5
def crawl_recursive_threaded(dirpath, ext):
    from database import indexer
    from database import utils
    from multiprocessing import Pool

    # convert to our infos
    cdir = indexer.DirInfo(dirpath, ext)
    cInfos = indexer.dirs_to_info(cdir.subfolders(), ext)

    # comment if you want a silent indexing
    print(cdir.to_string())

    # recursive pooled call
    # NOTE: child calls must not be pooled
    p = Pool(utils.Settings.config['processes'])
    infos = p.map(crawl_recursive, cInfos)
    p.close()

    # remove hierarchy
    dirInfos = [d for sublist in infos for d in sublist]
    dirInfos.append(cdir)

    print('I was crawling with %d processes' %
          utils.Settings.config['processes'])

    return dirInfos
Example #6
def main():
    global output_doc_path
    if output_doc_path == '':
        output_doc_path = os.path.join(os.path.split(input_doc_path)[0], 'outputTinypng')
    if not os.path.exists(output_doc_path):
        os.mkdir(output_doc_path)

    for parent,dirnames,filenames in os.walk(input_doc_path):    # yields: 1. parent directory 2. all folder names (without path) 3. all file names
      for dirname in dirnames:                       # folder entries
        # print("parent is:" + parent)
        # print("dirname is" + dirname)
        outDir = os.path.join(output_doc_path,os.path.relpath(os.path.join(parent,dirname),input_doc_path))
        if not os.path.exists(outDir):
            os.mkdir(outDir)

      for filename in filenames:                        # file entries
        # print("parent is:" + parent)
        # print("filename is:" + filename)
        filePaths.append(os.path.join(parent,filename))

    pngFilePaths = filter(lambda x:os.path.splitext(x)[1]=='.png' or os.path.splitext(x)[1]=='.jpg',filePaths)
    print('Parent process %s.' % os.getpid())
    p = Pool(poolLimite)
    for fileName in pngFilePaths:
        p.apply_async(getTinyPng, args=(fileName,))
    print('Waiting for all subprocesses done...')
    p.close()
    p.join()
    print('All subprocesses done.')
Example #7
  def __decrypt_file(self, private_d, public_n, keys, path_to_file, CRT, k):
    if CRT:
      pool = Pool(processes = k)
      promises = []
    decrpted_data = ''
    with open(path_to_file, 'r') as f:
      encrypted_data = f.read()
      encrypted_data_chunks = list(map(''.join, zip(*[iter(encrypted_data)]*len(str(public_n)))))
      for i in range(len(encrypted_data_chunks)):
        stripped = encrypted_data_chunks[i].lstrip('0')
        if CRT:
          promise = pool.apply_async(self.compute_part_of_message, args=(stripped, keys, i))
          promises.append(promise)
        else:
          decrpted_data += chr(self.__decrypt_message(stripped, private_d, public_n))
    if CRT:
      results = [promise.get() for promise in promises]
      decrypted_sorted = sorted(results, key = lambda x: x[1])
      for data in decrypted_sorted:
        decrpted_data += chr(data[0])

    if CRT:
      pool.close()
    with open(path_to_file + '.dec', 'w') as f:
      f.write(decrpted_data)
    return decrpted_data
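compute_part_of_message is not shown in Example #7; it evidently returns a (value, chunk index) pair, which is what the sorted(..., key=lambda x: x[1]) step relies on to reassemble the message. A standalone sketch of that return-with-index pattern, with a toy shift cipher standing in for the real CRT decryption:

from multiprocessing import Pool

def decrypt_chunk(args):
    chunk, index = args
    # Toy stand-in for the real per-chunk decryption: shift each character
    # back by one and report which position the chunk belongs to.
    return ord(chunk) - 1, index

if __name__ == '__main__':
    pool = Pool(processes=4)
    promises = [pool.apply_async(decrypt_chunk, ((c, i),)) for i, c in enumerate('ifmmp')]
    results = [p.get() for p in promises]
    pool.close()
    message = ''.join(chr(v) for v, _ in sorted(results, key=lambda x: x[1]))
    print(message)  # 'hello'
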
Example #8
def rc(rf, alphabet, numOfThreads):
	tryn=0
	counterTmp = 0
	printCounter = 1000
	listBasic = []
	if rf.endswith('.rar'):
		funcChosen = unrar
	elif rf.endswith('.zip') or rf.endswith('.7z') :
		funcChosen = zipFileUnzip
	for a in range(1,len(alphabet)+1):
		for b in itertools.product(alphabet,repeat=a):
			k="".join(b)
			k=re.escape(k)
			listBasic.append(k)
			tryn+=1
			if len(listBasic) == numOfThreads:
				pool = Pool(numOfThreads)
				pool.map_async(funcChosen, listBasic, callback = exitPass)
				pool.close()
				if resultPass:
					timeWasted = time.time()-start
					print 'Found! Password is '+resultPass
					print "It took " +str(round(time.time()-start,3))+" seconds"
					print "Speed: "+str(round(tryn/float(timeWasted),2))+" passwords/sec"
					print "Tried "+str(tryn)+" passwords"
					exit()
				listBasic = []
			counterTmp+=1
			if counterTmp >= printCounter:
				print 'Trying combination number '+str(tryn)+':'+str(k)
				timeWasted = round(time.time()-start,2)
				if timeWasted > 0:
					print "It took already " +str(timeWasted) +" seconds. Speed: "+str(round(tryn/float(timeWasted),2))+" passwords/sec"
				counterTmp=0
def build_from_queries(queries):
    p = Pool(5)
    query_results = p.map(q_exec, queries)
    p.close()
    p.join()
    #process the query_results
    return query_results
Example #10
def calcSynScores(scoresO,aabrhRawScoreSummmaryD,geneNames,geneOrderT,synWSize,numSynToTake,numThreads):
    '''Calculate the synteny score between two genes and add to edge
attributes of scoresO. We only bother making synteny scores for those
genes that have an edge in scoresO.
    '''
    
    neighborTL = createNeighborL(geneNames,geneOrderT,synWSize)

    # make list of groups of arguments to be passed to p.map. There
    # should be numThreads groups.
    argumentL = [([],neighborTL,numSynToTake,geneNames,aabrhRawScoreSummmaryD,scoresO) for i in range(numThreads)]

    i=0
    for gn1,gn2 in scoresO.iterateEdgesByEndNodes():
        argumentL[i%numThreads][0].append((gn1,gn2))
        i+=1

    p=Pool(numThreads) # num threads
    synScoresLL = p.map(synScoreGroup, argumentL)
    p.close()
    p.join()
    
    # add to scores object
    for synScoresL in synScoresLL:
        for gn1,gn2,sc in synScoresL:
            scoresO.addScoreByEndNodes(gn1,gn2,sc,'synSc')

    return scoresO
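Both Example #10 and Example #13 below pre-chunk the work into one argument group per worker before calling p.map, assigning edges round-robin; a minimal self-contained sketch of that chunking pattern (the worker and data here are illustrative):

from multiprocessing import Pool

def process_group(argT):
    pairs, offset = argT
    return [a + b + offset for a, b in pairs]

if __name__ == '__main__':
    numThreads = 4
    argumentL = [([], i) for i in range(numThreads)]
    for i, pair in enumerate([(1, 2), (3, 4), (5, 6), (7, 8), (9, 10)]):
        argumentL[i % numThreads][0].append(pair)  # round-robin assignment
    p = Pool(numThreads)
    resultLL = p.map(process_group, argumentL)
    p.close()
    p.join()
    print(resultLL)
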
def expand_all_commits(code_dir, target_dir, only_year=None):
  print code_dir
  uname_lookup_by_year_q = load_uname_lookup_by_year_q()
  latest_submissions = get_latest_submissions(code_dir)
  num_students = len(latest_submissions)

  def get_commit_args(args):
    i, student = args
    latest_submit = latest_submissions[student]
    student_dir = os.path.join(code_dir, latest_submit)
    year_q = get_submit_time(student_dir) 
    if (not year_q) or only_year != year_q: return (-1,'','',-1,'',-1)
    year_target_dir = os.path.join(target_dir, year_q)
    if year_q not in uname_lookup_by_year_q or \
          latest_submit not in uname_lookup_by_year_q[year_q]:
        add_uname_to_lookup(latest_submit, year_q, uname_lookup_by_year_q)
    student_id = uname_lookup_by_year_q[year_q][latest_submit]
    #if student_id != '2012010247': return (-1,'','',-1,'',-1)
    return i, student, student_dir, student_id, year_target_dir, num_students

  students = sorted(latest_submissions.keys())
  zipped_args = map(get_commit_args, enumerate(students))
  non_students = [student for i, student in enumerate(students) if zipped_args[i][0] == -1]
  #print "unsuccessful"
  #print '\n'.join([latest_submissions[student] for student in non_students])
  pool = ThreadPool(8)
  results = pool.map(thread_process_commit, zipped_args)
  pool.close()
  pool.join()
  export_uname_lookup_by_year_q(uname_lookup_by_year_q)
Example #12
def import_images(folder, par=True, ttime=True):
  """
  This function loads images from a folder as PIL Image files and
  thresholds them, creating a list of z-slices to be turned into a matrix
  This version is not currently used.
  """
  fils = os.listdir(folder)
  def keep_tifs(rawlist):
    tiflist = []
    for f in rawlist:
      if len(f.split('.'))>1:
        if f.split('.')[1] == 'tif':
          tiflist.append(f)
    return tiflist
  tiflist = keep_tifs(fils)
  newtiflist = sorted(folder+f for f in tiflist) # alphabetized full paths
  tifobjs = [load_img_array(f) for f in newtiflist]
  
  # here start parallel stuff
  results_par, results_nopar = None, None
  if par or ttime:
    start_time_par = timer()
    pool = Pool(8)
    results_par = pool.map(show_at_thresh, tifobjs)
    pool.close()
    pool.join()
    total_time_par = timer() - start_time_par
    print('Time for parallel: %.2f seconds' % total_time_par)
  # or non-parallel stuff
  if not par or ttime:
    start_time_nopar = timer()
    results_nopar = [show_at_thresh(f) for f in tifobjs]
    total_time_nopar = timer() - start_time_nopar
    print('Time for non-parallel: %.2f seconds' % total_time_nopar)

  return results_par, results_nopar
Example #13
def calcRawScores(fastaFilePath,numThreads,geneNames,gapOpen, gapExtend, matrix, scoresO):
    '''Get a global alignment based raw score for every edge in scoresO.'''

    # load sequences
    protFnL=glob.glob(fastaFilePath)
    seqD=genomes.loadProt(protFnL)
                
    # make list of sets of arguments to be passed to p.map. There
    # should be numThreads sets.
    argumentL = [([],seqD,gapOpen, gapExtend, matrix) for i in range(numThreads)]

    i=0
    for g1,g2 in scoresO.iterateEdgesByEndNodes():
        edgeNum = scoresO.endNodesToEdge(g1,g2)
        edgeT = edgeNum,geneNames.numToName(g1),geneNames.numToName(g2)
        argumentL[i%numThreads][0].append(edgeT)
        i+=1
        
    # run
    p=Pool(numThreads)
    scoresLL = p.map(rawScoreGroup, argumentL)
    p.close()
    p.join()

    
    # store in scoresO
    for scoresL in scoresLL:
        for edgeNum,sc in scoresL:
            scoresO.addScoreByEdge(edgeNum,sc,'rawSc')

    return scoresO
Example #14
def getData():
    if os.path.isfile("chat_urls.p"):
        chat_urls = pickle.load( open( "chat_urls.p", "rb" ) )
    else:
        chat_urls = {}
        for user in users:
            chat_urls[user] = get_urls(user)
        teams_url = "http://espn.go.com/mlb/teams"
        pickle.dump( chat_urls, open( "chat_urls.p", "wb" ) )

    # for user in chat_urls:
    #     urls = chat_urls[user]
    #     for url in urls:
    #         getLog(url)
    logDB = {}
    for user in chat_urls:
        logDB[user] = {}
    p = Pool(20)
    i=0
    manager = Manager()
    db = manager.dict()
    for user in chat_urls:
        for url in chat_urls[user]:
            i+=1
            p.apply_async(addLogData, args=(url,db))
    p.close()
    p.join()
    out = db._getvalue()
    outfile = open("rawChat.txt","wb")
    for url in out:
        outfile.write(out[url]+"\n")
Example #15
def fetch_imagery(image_locations, local_dir):
    pool = Pool(cpu_count())
    tupled = [(loc[0], loc[1], local_dir) for loc in image_locations]
    try:
        pool.map(fetch_imagery_uncurried, tupled)
    finally:
        pool.close()
Example #16
 def correction_terms_threaded(self):
     '''Finds the correction terms associated with the quadratic form;
     for each of the equivalence classes it finds the maximum by
     iterating through the relation vectors of the group.

     Uses multiprocessing.'''
     print 'Using multiprocessing'
     pool = Pool() # default: processes=None => uses cpu_count()
     manager = Manager()
     start_time = time.time()
     coef_lists = lrange(self.group.structure)
     # representatives = elements of C_1(V) (np.matrix)
     representatives = map(lambda l: self.find_rep(l), coef_lists)
     # list of maxes        
     lst = manager.list([None for i in xrange(len(representatives))]) 
     alphalist = list(self.get_alpha()) # cannot pickle generators
     pool.map_async(functools.partial(process_alpha_outside, self, 
                                      representatives, lst), alphalist)
     pool.close()
     pool.join() # wait for pool to finish
     # get corrterms via (|alpha|^2+b)/4
     print 'Computed from quadratic form in %g seconds' \
           % (time.time() - start_time)
     return [Fraction(Fraction(alpha, self.int_inverse[1]) + self.b, 4) \
                     for alpha in lst]            
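Example #16 shares results through a Manager list and binds the extra arguments with functools.partial; a smaller self-contained sketch of that pattern (the worker here is illustrative):

import functools
from multiprocessing import Pool, Manager

def record_square(shared, item):
    index, value = item
    # workers write into the proxy list managed by the parent process
    shared[index] = value * value

if __name__ == '__main__':
    manager = Manager()
    shared = manager.list([None] * 5)
    pool = Pool()
    pool.map_async(functools.partial(record_square, shared), list(enumerate(range(5))))
    pool.close()
    pool.join()
    print(list(shared))  # [0, 1, 4, 9, 16]
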
Example #17
def main():
    if MAIL_TO:
        signal.signal(signal.SIGALRM, send_email_by_alarm)
        signal.alarm(TIME_NOTIFICATION_BY_EMAIL)
        send_email_start()
    start_time = int(time.time())
    manager = Manager()
    queue = manager.Queue()
    pool = Pool(PROCESS_NUMBER + 1)
    jobs = []
    pool.apply_async(listener, args=(queue,))
    for config_file in FINAL_CONFIG_TO_SCRAPE:
        job = pool.apply_async(scraper, (config_file, queue))
        jobs.append(job)
    for i, job in enumerate(jobs):
        job.get()
    # although all jobs have finished, for unknown reasons some providers may still be running
    time.sleep(10)
    # allow extra time to make sure no provider is still running
    for i in range(1000):
        if len(get_summary().provider_running) > 0:
            time.sleep(500)
        else:
            break
    print "Run all has finished"
    queue.put(LISTENER_KILL_SIGNAL)
    pool.close()
    if MAIL_TO:
        send_email_end()
Example #18
def dirImgProcess(path):
    global workerPool, workerOutput, theGreatIndex
    workerPool = Pool()
    workerOutput = []
    work = []
    theGreatIndex = {}
    pagenumber = 0
    for (dirpath, dirnames, filenames) in os.walk(path):
        for afile in filenames:
            if getImageFileName(afile) is not None:
                pagenumber += 1
                work.append([afile, dirpath, options])
    if GUI:
        GUI.progressBarTick.emit(str(pagenumber))
    if len(work) > 0:
        for i in work:
            workerPool.apply_async(func=fileImgProcess, args=(i, ), callback=fileImgProcess_tick)
        workerPool.close()
        workerPool.join()
        if GUI and not GUI.conversionAlive:
            rmtree(os.path.join(path, '..', '..'), True)
            raise UserWarning("Conversion interrupted.")
        if len(workerOutput) > 0:
            rmtree(os.path.join(path, '..', '..'), True)
            raise RuntimeError("One of workers crashed. Cause: " + workerOutput[0])
    else:
        rmtree(os.path.join(path, '..', '..'), True)
        raise UserWarning("Source directory is empty.")
Example #19
  def score_all_genes(self, graph, num_procs=1):
    partial_score_gene = partial(score_gene, graph=graph, top_genes=self.top_genes)
    p = Pool(num_procs)
    result = p.map(partial_score_gene, list(self.vd.gene_names()))
    p.close()

    # convert them all to percentiles
    cent_hist = numpy.array([x[1] for x in result if x[1] != -1])
    nn_hist = numpy.array([x[2] for x in result if x[2] != -1])

    batch = []

    for gene, cent_score, nn_score in result:
      # edge case: gene is a top gene
      if gene in self.top_genes:
        cent_perc = 1
        nn_perc = 1
      # edge case: gene isn't in network
      elif cent_score == -1 or \
           nn_score == -1:
        cent_perc = 0
        nn_perc = 0
      else:
        cent_perc = scipy.stats.percentileofscore(cent_hist, cent_score) / 100.0
        nn_perc = 1 - scipy.stats.percentileofscore(nn_hist, nn_score) / 100.0

        print "gene:  %s\n  c:   %s\n  c_p: %s\n  n:   %s\n  n_p: %s" % \
          (gene, cent_score, cent_perc, nn_score, nn_perc)

      batch.append((cent_score, cent_perc, nn_score, nn_perc, gene))

    self.vd._c.executemany("UPDATE genes SET cent_score = ?, cent_perc = ?, " \
      "nn_score = ?, nn_perc = ? WHERE name = ?", batch)
    self.vd._conn.commit()
def train_word2id():
    """把训练集的所有词转成对应的id。"""
    time0 = time.time()
    print('Processing train data.')
    df_train = pd.read_csv('../raw_data/question_train_set.txt', sep='\t', usecols=[0, 2, 4],
                           names=['question_id', 'word_title', 'word_content'], dtype={'question_id': object})
    print('training question number %d ' % len(df_train))
    # questions without content: substitute the title
    na_content_indexs = list()
    for i in tqdm(xrange(len(df_train))):
        word_content = df_train.word_content.values[i]
        if type(word_content) is float:
            na_content_indexs.append(i)
    print('There are %d train questions without content.' % len(na_content_indexs))
    for na_index in tqdm(na_content_indexs):
        df_train.at[na_index, 'word_content'] = df_train.at[na_index, 'word_title']
    # questions without a title: drop them
    na_title_indexs = list()
    for i in xrange(len(df_train)):
        word_title = df_train.word_title.values[i]
        if type(word_title) is float:
            na_title_indexs.append(i)
    print('There are %d train questions without title.' % len(na_title_indexs))
    df_train = df_train.drop(na_title_indexs)
    print('After dropping, training question number(should be 2999952) = %d' % len(df_train))
    # convert to id form
    p = Pool()
    train_title = np.asarray(p.map(get_id4words, df_train.word_title.values))
    np.save('../data/wd_train_title.npy', train_title)
    train_content = np.asarray(p.map(get_id4words, df_train.word_content.values))
    np.save('../data/wd_train_content.npy', train_content)
    p.close()
    p.join()
    print('Finished changing the training words to ids. Costed time %g s' % (time.time() - time0))
Example #21
def multiprocessing_method(n_of_darts, n_of_pools = 10):
	'''Uses 10 processes to split the work of dart_algorithm. Since the processes are independent, \
	I split the number of darts over the number of processes, and used the pool function. Instead of \
	looping over the number of processes, which would be time-consuming, I wrote them explicitly.'''


	pool = Pool(processes=n_of_pools)  
	darts_per_pool = n_of_darts/n_of_pools

	start=time()
	result1 = pool.map_async(dart_algorithm, [darts_per_pool])
	result2 = pool.map_async(dart_algorithm, [darts_per_pool])
	result3 = pool.map_async(dart_algorithm, [darts_per_pool])
	result4 = pool.map_async(dart_algorithm, [darts_per_pool])
	result5 = pool.map_async(dart_algorithm, [darts_per_pool])
	result6 = pool.map_async(dart_algorithm, [darts_per_pool])
	result7 = pool.map_async(dart_algorithm, [darts_per_pool])
	result8 = pool.map_async(dart_algorithm, [darts_per_pool])
	result9 = pool.map_async(dart_algorithm, [darts_per_pool])
	result10 = pool.map_async(dart_algorithm, [darts_per_pool])
	end=time()
	
	Pool_darts_in_circle = result1.get()[0] + result2.get()[0]+result3.get()[0] +\
	result4.get()[0]+result5.get()[0]+result6.get()[0]+ result7.get()[0]+ \
	result8.get()[0]+ result9.get()[0]+ result10.get()[0]
	multiprocessing_time = end-start
	
	pool.close()
	pool.join()
	
	pi = 4 * Pool_darts_in_circle / float(n_of_darts)
	
	# print "multiprocessing pi approximation= ", pi
	
	return multiprocessing_time
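dart_algorithm is not defined in Example #21; a hypothetical Monte Carlo worker compatible with the calls above would take a dart count and return how many darts landed inside the quarter circle:

import random

def dart_algorithm(n_of_darts):
    # Hypothetical worker: throw darts at the unit square and count the
    # hits inside the quarter circle of radius 1 (used to estimate pi).
    hits = 0
    for _ in range(n_of_darts):
        x, y = random.random(), random.random()
        if x * x + y * y <= 1.0:
            hits += 1
    return hits
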
Example #22
def multi_remote_exec_cmd(hosts, username, cmd):
    pool = Pool(processes=5)
    for host in hosts:
        username, password, ip, port = get_connect_item(username, host)
        pool.apply_async(remote_exec_cmd, (ip, port, username, password, cmd))
    pool.close()
    pool.join()
Example #23
   def get(self, tag="貓咪", max_tag_id=None):
      if tag == "":
         tag = "貓咪"
      p = Pool(10)

      if self.prefix == "ajax":
         medias, next_ = util.search_by_tag(tag, 3, max_tag_id)
      else:
         medias, next_ = util.search_by_tag(tag, 5, max_tag_id)

      fs = p.map(util.features, medias)
      p_label, _, _ = libsvm.svm_predict([1] * len(fs), fs, model)
      for (m, f) in zip(medias, fs):
         print(m["caption"]["text"])
         print(f)
      if self.prefix == "ajax":
         medias = map(lambda (m, l): Media(m, l).__dict__, zip(medias, p_label))
         self.write(json.dumps({
            "max_tag_id": next_,
            "medias": medias
         }))
      else:
         medias = map(lambda (m, l): Media(m, l), zip(medias, p_label))
         if self.prefix == "demo1":
            self.render("demo1.html", medias=medias, tag_name=tag, max_tag_id=next_)
         elif self.prefix == "demo2":
            self.render("demo2.html", medias=medias, tag_name=tag, max_tag_id=next_)
         else:
            self.render("main.html", medias=medias, tag_name=tag, max_tag_id=next_)

      p.close()
      p.join()
def matrix_vector_iteration_by_processes(A,x,k):
	# create a temporary directory to store the matrix and the vectors
	tmpdir = tempfile.mkdtemp()

	nvec = get_nvec(x)
	y = x.copy()

	save_matrix(tmpdir,A)
	for i in xrange(nvec):
		save_x(tmpdir,x,i)

	# start processes
	pool = Pool(processes=min(nvec,6))
	processes = []

	for i in xrange(nvec):
		processes.append( pool.apply_async(matrix_vector_iteration_process, (tmpdir,i,k)) ) 

	# fetch results (vector/matrix shape version)
	if x.ndim  == 1:
		processes[0].get()
		y = load_x(tmpdir,0)
	else:
		for i in xrange(nvec):
			processes[i].get()
			y[:,i] = load_x(tmpdir,i)

	pool.close()

	# remove temporary directory (with all it contains)
	shutil.rmtree(tmpdir)

	return y
Example #25
def run_train_models(processes, model_library, **kwargs):
    """Train many supervised learning problems in parallel

    model_library = a list specifying the model library for the dataset, in
            the format needed for TrainModelCV
    **kwargs: all the rest of the input to TrainModelCV"""
    # sample input for model_library:
    #          [[LogisticRegression, classification_error, 'parameters.json', (), {'lam':0.5}],
    #          [LogisticRegression, auc_wmw_fast, None, (), {'C':50}]]

    # use a process pool to execute all the training jobs
    # collect the results and combine to return
    from multiprocessing import Pool

    p = Pool(processes)

    #ret = {}
    #for model in model_library:
    #    p.apply_async(_pool_helper, (model_library, ), kwargs, callback=ret.update)

    results = []
    for model in model_library:
        results.append(p.apply_async(_pool_helper, (model, ), kwargs))

    # wait on the pool to finish
    p.close()
    p.join()

    # collect the results
    ret = {}
    for result in results:
        ret.update(result.get())

    return ret
Example #26
    def get(self):
        mode = toAlpha3Code(self.get_argument('lang'))
        text = self.get_argument('q')
        if not text:
            self.send_error(400, explanation='Missing q argument')
            return

        def handleCoverage(coverage):
            if coverage is None:
                self.send_error(408, explanation='Request timed out')
            else:
                self.sendResponse([coverage])

        if mode in self.analyzers:
            pool = Pool(processes=1)
            result = pool.apply_async(getCoverage, [text, self.analyzers[mode][0], self.analyzers[mode][1]])
            pool.close()

            @run_async_thread
            def worker(callback):
                try:
                    callback(result.get(timeout=self.timeout))
                except TimeoutError:
                    pool.terminate()
                    callback(None)

            coverage = yield tornado.gen.Task(worker)
            handleCoverage(coverage)
        else:
            self.send_error(400, explanation='That mode is not installed')
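Example #26 uses a one-process pool so the request can be abandoned on timeout: apply_async, close, then get(timeout=...) and terminate() if the deadline expires. A minimal standalone sketch of that timeout pattern:

import time
from multiprocessing import Pool, TimeoutError

def slow_task(seconds):
    time.sleep(seconds)
    return seconds

if __name__ == '__main__':
    pool = Pool(processes=1)
    result = pool.apply_async(slow_task, [5])
    pool.close()
    try:
        print(result.get(timeout=1))
    except TimeoutError:
        pool.terminate()  # kill the worker instead of waiting for it
        print('timed out')
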
def main(path, out, cores):
    """
    Compute contact energies for each pdb in path and write results to 'out'.
    :param path: str
    :param out: str
    :param cores: int
    :return: None
    """
    # Find all pdbs in path
    workload = []
    for file in os.listdir(path):
        if os.path.splitext(file)[1].lower() == ".pdb":
            workload.append(file)
    # Print few newlines to prevent progressbar from messing up the shell
    print("\n\n")
    # Compute energies
    pool = Pool(processes=cores)
    results = []
    for (nr, pdb) in enumerate(workload):
        updateprogress(pdb, nr / len(workload))
        e = computecontactenergy(os.path.join(path, pdb), pool)
        results.append((pdb, e))
    pool.close()
    # Make 100% to appear
    updateprogress("Finished", 1)
    # Store output
    with open(out, "w") as handler:
        handler.write("PDB,Energy in kcal/mol\n")
        for pair in results:
            handler.write("{},{}\n".format(*pair))
Example #28
    def _run(self, source, destination_format, clear_source=False, workers=-1):
        """
        parallel version of the `convert` method
        :param source: (rdf) files to convert (source path)
        :param destination_format: the destination format
        :param clear_source: if set, delete the source files. Default = False
        :return: None
        """

        files = []
        src = os.path.abspath(source)
        if os.path.isdir(src):
            files = [os.path.join(src, f) for f in os.listdir(src) if to_process(f, destination_format)]
        elif os.path.exists(src):
            files = [src]
        self._log.info('to process: {0}'.format(files))
        if clear_source:
            self._log.warn('will remove original files after conversion')

        def job_finished(res):
            print '.',
            sys.stdout.flush()

        num_cpus = cpu_count()
        num_workers = workers if 0 < workers < num_cpus else num_cpus

        pool = Pool(processes=num_workers)

        for src in files:
            dst = dest_file_name(src, destination_format)
            if dst:
                pool.apply_async(convert_file, (src, dst, clear_source), callback=job_finished)

        pool.close()
        pool.join()
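The job_finished callback in Example #28 runs in the parent process as each apply_async job completes; a small self-contained sketch of that callback-based progress reporting:

import sys
from multiprocessing import Pool

def work(n):
    return n * n

def job_finished(res):
    # called in the parent process once per completed job, with its result
    sys.stdout.write('.')
    sys.stdout.flush()

if __name__ == '__main__':
    pool = Pool(processes=4)
    for n in range(10):
        pool.apply_async(work, (n,), callback=job_finished)
    pool.close()
    pool.join()
    print('')
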
def run_make_submission(settings, targets_and_pipelines, split_ratio):
    pool = Pool(settings.N_jobs)
    for i, (target, pipeline, feature_masks, classifier, classifier_name) in enumerate(targets_and_pipelines):
        for j, feature_mask in enumerate(feature_masks):
            progress_str = 'T=%d/%d M=%d/%d' % (i+1, len(targets_and_pipelines), j+1, len(feature_masks))
            pool.apply_async(make_submission_predictions, [settings, target, pipeline, classifier, classifier_name],
                {'feature_mask': feature_mask, 'progress_str': progress_str, 'quiet': True})
    pool.close()
    pool.join()

    guesses = ['clip,preictal']
    num_masks = None
    classifier_names = []
    for target, pipeline, feature_masks, classifier, classifier_name in targets_and_pipelines:
        classifier_names.append(classifier_name)
        if num_masks is None:
            num_masks = len(feature_masks)
        else:
            assert num_masks == len(feature_masks)

        test_predictions = []

        for feature_mask in feature_masks:
            data = make_submission_predictions(settings, target, pipeline, classifier, classifier_name, feature_mask=feature_mask)
            test_predictions.append(data.mean_predictions)

        predictions = np.mean(test_predictions, axis=0)
        guesses += make_csv_for_target_predictions(target, predictions)

    output = '\n'.join(guesses)
    write_submission_file(settings, output, 'ensemble n=%d split_ratio=%s' % (num_masks, split_ratio), None, str(classifier_names), targets_and_pipelines)
def test_word2id():
    """把测试集的所有词转成对应的id。"""
    time0 = time.time()
    print('Processing eval data.')
    df_eval = pd.read_csv('../raw_data/question_eval_set.txt', sep='\t', usecols=[0, 2, 4],
                          names=['question_id', 'word_title', 'word_content'], dtype={'question_id': object})
    print('test question number %d' % len(df_eval))
    # questions without a title: substitute the content
    na_title_indexs = list()
    for i in xrange(len(df_eval)):
        word_title = df_eval.word_title.values[i]
        if type(word_title) is float:
            na_title_indexs.append(i)
    print('There are %d test questions without title.' % len(na_title_indexs))
    for na_index in na_title_indexs:
        df_eval.at[na_index, 'word_title'] = df_eval.at[na_index, 'word_content']
    # questions without content: substitute the title
    na_content_indexs = list()
    for i in tqdm(xrange(len(df_eval))):
        word_content = df_eval.word_content.values[i]
        if type(word_content) is float:
            na_content_indexs.append(i)
    print('There are %d test questions without content.' % len(na_content_indexs))
    for na_index in tqdm(na_content_indexs):
        df_eval.at[na_index, 'word_content'] = df_eval.at[na_index, 'word_title']
    # convert to id form
    p = Pool()
    eval_title = np.asarray(p.map(get_id4words, df_eval.word_title.values))
    np.save('../data/wd_eval_title.npy', eval_title)
    eval_content = np.asarray(p.map(get_id4words, df_eval.word_content.values))
    np.save('../data/wd_eval_content.npy', eval_content)
    p.close()
    p.join()
    print('Finished changing the eval words to ids. Costed time %g s' % (time.time() - time0))
Example #31
def main():
    # set arguments
    # arguments are passed to classes
    parser = argparse.ArgumentParser(
        description="Evaluate completeness and contamination of a MAG."
    )
    parser.add_argument(
        "genomes", type=str, help="Find marker for these genomes", nargs="+"
    )
    parser.add_argument(
        "--out",
        "-o",
        type=str,
        required=False,
        help="Path to output folder (Default: .)",
        default=".",
    )
    parser.add_argument("--db", type=str, default=None, help="Path to EukCC DB")
    parser.add_argument(
        "--threads", type=int, help="Number of threads to use (Default: 1)", default=1
    )
    parser.add_argument(
        "--tree",
        type=int,
        help="Number of profiles to use at target for tree profiles (default: 30)",
        default=30,
    )
    parser.add_argument(
        "--clade",
        default="base",
        type=str,
        help="Define clade as base, fungi, protozoa or plants (Defaut: base)",
    )
    parser.add_argument(
        "--quiet",
        "-q",
        dest="quiet",
        action="store_true",
        default=False,
        help="Silcence most output",
    )
    parser.add_argument(
        "--debug",
        "-d",
        action="store_true",
        default=False,
        help="Debug and thus ignore safety",
    )
    parser.add_argument(
        "-v",
        "--version",
        action="version",
        version="EukCC version {}".format(version.__version__),
    )
    args = parser.parse_args()
    state = eukcc_state(
        workdir=os.path.join(args.out, "refine_workdir"), options=vars(args)
    )
    file.isdir(state["workdir"])

    # define logging
    logLevel = logging.INFO
    if state["quiet"]:
        logLevel = logging.WARNING
    elif state["debug"]:
        logLevel = logging.DEBUG
    logging.basicConfig(
        format="%(asctime)s %(message)s",
        datefmt="%d-%m-%Y %H:%M:%S: ",
        level=logLevel,
    )
    # if db is not set, we check for env variable
    if state["db"] is None:
        if os.environ.get("EUKCC2_DB") is not None:
            state["db"] = os.environ.get("EUKCC2_DB")
            logging.debug(
                "Defined db via env variable EUKCC2_DB as '{}'".format(state["db"])
            )
        else:
            logging.error("No database was provided via --db or EUKCC2_DB env variable")
            exit(202)

    logging.info("EukCC version {}".format(version.__version__))

    logging.info(
        "Looking for shared markers across {} genomes".format(len(state["genomes"]))
    )
    n_per_worker = 4  # using more threads for hmmer makes no sense, so we parallelize across genomes
    if state["threads"] > (2 * n_per_worker):
        # multithreading pool
        n_processes = math.floor(state["threads"] / n_per_worker)
        logging.info(
            "Launching {} threads with {} threads each".format(
                n_processes, n_per_worker
            )
        )
        pool = Pool(processes=n_processes)
        # override the threads option passed to each worker
        opt = {k: v for k, v in state.opt.items()}
        opt["threads"] = n_per_worker
        search_genome_p = partial(search_genome, state=opt)
        data = pool.map(search_genome_p, state["genomes"])
        pool.close()
        pool.join()
    else:
        data = []
        for genome in state["genomes"]:
            data.append(search_genome(genome, state))

    tree_profiles = define_tree_set(data, n_target=args.tree)
    result = find_intersection(data, missing=3)
    outfile = os.path.join(state["out"], "profiles.txt")
    with open(outfile, "w") as fout:
        for key, profiles in result.items():
            for profile in profiles:
                fout.write("{}\t{}\n".format(key, profile))
        for profile in tree_profiles:
            fout.write("{}\t{}\n".format("tree", profile))
    logging.info("wrote profiles to {}".format(outfile))
        Params.append(YearFrom)
        Params.append(MonthFrom)
        Params.append(DayFrom)
        Params.append(YearTo)
        Params.append(MonthTo)
        Params.append(DayTo)
        Params.append(CityCoordinates)
        Params.append(args.ephem)
        Params.append(stdmag)
        Params.append(CityElevation)
        paramToFunction.append(Params)

    if len(paramToFunction) > 0:
        pool = Pool()
        function_output = pool.imap_unordered(calculatePasses, paramToFunction)
        pool.close()
        output = []
        errors = []
        for _ in tqdm.tqdm(function_output, total=len(paramToFunction)):
            if isinstance(_, str):
                errors.append(_)
            else:
                output.append(_)
        pool.join()
        print("--- %s seconds calculating orbits ---" %
              ((time.time() - start_time)))
        if errors is not None:
            if len(errors) > 0:
                with open(ErrorFile, 'w') as f:
                    for item in errors:
                        f.write("%s\n" % item)
Example #33
def main(argv=None):
    """The main entry-point to salvo."""
    if argv is None:
        argv = sys.argv[1:]

    parser = argparse.ArgumentParser(description='Provision a new salvo.')
    parser.add_argument('config',
                        type=argparse.FileType('r'),
                        help='salvo configuration file to run')
    parser.add_argument('--playbook',
                        '-p',
                        type=argparse.FileType('r'),
                        default='./deploy/playbook.yml',
                        help='directory where playbooks reside')
    parser.add_argument('--wait',
                        '-w',
                        default=False,
                        action='store_true',
                        help='wait for [Enter] before cleaning up')
    parser.add_argument('--deployment',
                        '-d',
                        type=str,
                        default='salvo',
                        help='deployment name for this salvo')
    parser.add_argument('--set',
                        '-s',
                        nargs='*',
                        type=str,
                        help='key:value pair to set for this salvo execution')
    parser.add_argument('--dry-run',
                        '-n',
                        action='store_true',
                        default=False,
                        help='only print what actions would be taken')
    args = parser.parse_args(argv)

    args.set = dict(item.split(":", maxsplit=1)
                    for item in args.set) if args.set is not None else {}
    topology = Topology.load_file(args.config, args.set)

    hq = Cluster('hq', {
        'expose': [22],
    }, {})
    topology.clusters = [hq] + topology.clusters

    agenda.section("Set up network")

    client = boto3.client('ec2')
    ec2 = boto3.resource('ec2')

    # Set up VPC
    agenda.task("Create VPC")
    vpc = client.create_vpc(DryRun=args.dry_run, CidrBlock='10.0.0.0/16')
    vpc = ec2.Vpc(vpc['Vpc']['VpcId'])

    agenda.task("Attach VPC internet gateway")
    gateway = client.create_internet_gateway(DryRun=args.dry_run)
    gateway = ec2.InternetGateway(
        gateway['InternetGateway']['InternetGatewayId'])
    gateway.attach_to_vpc(DryRun=args.dry_run, VpcId=vpc.id)

    agenda.task("Create internet-enabled route table")
    iroutable = vpc.create_route_table(DryRun=args.dry_run)
    iroutable.create_route(DryRun=args.dry_run,
                           DestinationCidrBlock='0.0.0.0/0',
                           GatewayId=gateway.id)

    subnets = []
    secs = []
    for i, c in enumerate(topology.clusters):
        agenda.task("Allocate subnet #{}".format(i + 1))
        subnet = vpc.create_subnet(DryRun=args.dry_run,
                                   CidrBlock='10.0.{}.0/24'.format(i))

        if c.internet:
            agenda.subtask("Hook in internet-enable route table")
            iroutable.associate_with_subnet(DryRun=args.dry_run,
                                            SubnetId=subnet.id)

        # set up security groups
        agenda.subtask("Create network security group")
        sec = vpc.create_security_group(
            DryRun=args.dry_run,
            GroupName='{}-cluster-{}'.format(args.deployment, i + 1),
            Description='Ingress rules for cluster {}-{}'.format(
                args.deployment, c.name))
        # allow all internal traffic
        sec.authorize_ingress(DryRun=args.dry_run,
                              IpProtocol='tcp',
                              FromPort=1,
                              ToPort=65535,
                              CidrIp='10.0.0.0/16')

        if c.expose is not False:
            for p in c.expose:
                agenda.subtask("Allow ingress traffic on port {}".format(p))
                sec.authorize_ingress(DryRun=args.dry_run,
                                      IpProtocol='tcp',
                                      FromPort=p,
                                      ToPort=p,
                                      CidrIp='0.0.0.0/0')

        secs.append(sec)
        subnets.append(subnet)

    # Tag all our VPC resources
    agenda.task("Tag all VPC resources")
    ec2.create_tags(DryRun=args.dry_run,
                    Resources=[
                        vpc.id,
                        gateway.id,
                        iroutable.id,
                    ] + [sn.id for sn in subnets] + [sg.id for sg in secs],
                    Tags=[{
                        'Key': 'salvo',
                        'Value': args.deployment,
                    }])

    # Create access keys
    agenda.task("Generate VPC key pair")
    try:
        keys = client.create_key_pair(DryRun=args.dry_run,
                                      KeyName=args.deployment)
    except botocore.exceptions.ClientError:
        # Key probably already exists. Delete and re-create.
        agenda.subfailure("Could not create key pair")
        agenda.subtask("Attempting to delete old key pair")
        client.delete_key_pair(DryRun=args.dry_run, KeyName=args.deployment)
        agenda.subtask("Attempting to generate new key pair")
        keys = client.create_key_pair(DryRun=args.dry_run,
                                      KeyName=args.deployment)

    keymat = keys['KeyMaterial']
    keys = ec2.KeyPair(keys['KeyName'])

    agenda.section("Launch instances")

    # Launch instances
    clusters = []
    for i, c in enumerate(topology.clusters):
        nics = [{
            "DeviceIndex": 0,
            "Groups": [secs[i].id],
            "SubnetId": subnets[i].id,
            "DeleteOnTermination": True,
            "AssociatePublicIpAddress": c.internet,
        }]

        agenda.task("Launching {} instances in cluster {}".format(
            c.attrs['count'], c.name))
        clusters.append(
            list(
                map(lambda x: ec2.Instance(x), [
                    instance['InstanceId']
                    for instance in client.run_instances(
                        DryRun=args.dry_run,
                        KeyName=keys.name,
                        NetworkInterfaces=nics,
                        ImageId=c.attrs['image'],
                        MinCount=c.attrs['count'],
                        MaxCount=c.attrs['count'],
                        InstanceType=c.attrs['itype'],
                        InstanceInitiatedShutdownBehavior='terminate')
                    ['Instances']
                ])))

    exit = 1
    try:
        agenda.task("Wait for HQ to start running")

        hq = clusters[0][0]
        while hq.state['Name'] == 'pending':
            agenda.subtask("Still in 'pending' state")
            sleep(3)
            hq.load()

        if hq.state['Name'] != 'running':
            agenda.failure(hq.state_reason['Message'])
            raise ChildProcessError(hq.state_reason['Message'])

        def prepare(ci, instance):
            global hq
            print("instance {} in {} now available through {}".format(
                instance.private_ip_address, topology.clusters[ci].name,
                hq.public_ip_address))

        agenda.task("Wait for workers to reach 'running' state")

        done = []
        p = Pool(5)
        pending = True
        while pending:
            pending = False
            for i, cluster in enumerate(clusters):
                for ii, instance in enumerate(cluster):
                    if instance.state['Name'] == 'pending':
                        agenda.subtask(
                            "Instance {}.{} is still pending".format(
                                i + 1, ii + 1))

                        pending = True
                        instance.load()
                        break
                    elif instance.state['Name'] != 'running':
                        agenda.subfailure("Instance {}.{} failed: {}".format(
                            i + 1, ii + 1, instance.state_reason['Message']))
                        raise ChildProcessError(
                            instance.state_reason['Message'])
                    else:
                        # State is now 'running'
                        tag = (i, ii)
                        if tag not in done:
                            # State hasn't been 'running' before
                            done.append(tag)
                            p.apply_async(prepare, [i, instance])
                if pending:
                    break
            sleep(3)
        p.close()
        p.join()

        agenda.task("Wait for HQ to become pingable")

        # Wait for hq to be pingable
        deployment = Deployer(args.playbook.name, topology, keymat, clusters)
        while not deployment.test(hq.public_ip_address):
            sleep(1)

        agenda.task("Wait for workers to become pingable")

        # Wait for workers to be pingable
        for i, cluster in enumerate(clusters):
            for ii, instance in enumerate(cluster):
                while not deployment.test(instance.private_ip_address):
                    sleep(1)

        # Deploy!
        agenda.section("Deploy application")
        exit = deployment.deploy()
    except:
        import traceback
        traceback.print_exc()
    finally:
        agenda.section("Clean up VPC")

        if args.wait:
            agenda.prompt("Press [Enter] when you are ready to clean")
            input()

        # Terminate instances and delete VPC resources
        agenda.task("Terminate all instances")
        instances = list(vpc.instances.all())
        vpc.instances.terminate(DryRun=args.dry_run)
        still_running = True
        while still_running:
            still_running = False
            for i in instances:
                i.load()
                if i.state['Name'] != 'terminated':
                    agenda.subtask("At least one instance still shutting down")
                    still_running = True
                    sleep(3)
                    break

        agenda.task("Delete network resources")
        agenda.subtask("key pair")
        keys.delete(DryRun=args.dry_run)
        agenda.subtask("internet-enabled route associations")
        for r in iroutable.associations.all():
            r.delete(DryRun=args.dry_run)
        agenda.subtask("internet-enabled route table")
        iroutable.delete(DryRun=args.dry_run)
        agenda.subtask("internet gateway")
        gateway.detach_from_vpc(DryRun=args.dry_run, VpcId=vpc.id)
        gateway.delete(DryRun=args.dry_run)
        agenda.subtask("subnets")
        try:
            for sn in subnets:
                sn.delete(DryRun=args.dry_run)
        except:
            agenda.subfailure("failed to delete subnet:")
            import traceback
            traceback.print_exc()
        agenda.subtask("security groups")
        for sg in secs:
            sg.delete()
        agenda.subtask("network interfaces")
        for i in vpc.network_interfaces.all():
            i.delete(DryRun=args.dry_run)

        agenda.task("Delete the VPC")
        vpc.delete(DryRun=args.dry_run)

    return exit
from IGP_scene_prediction import navigation, data_clean, overall_plot
from multiprocessing import Pool
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import math

if __name__ == '__main__':
    agent, observation, time_span = data_clean(300, 540, 12)
    # result, mod = navigation(observation, time_span, 96, 0.01, 100)
    # overall_plot(agent, result)

    p = Pool(4)
    arguments = [(observation, time_span, 12, 0.01, 100),
                 (observation, time_span, 24, 0.01, 100),
                 (observation, time_span, 48, 0.01, 100),
                 (observation, time_span, 96, 0.01, 100)]
    result = p.starmap(navigation, arguments)
    p.close()
    p.join()
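Pool.starmap (Python 3.3+) unpacks each argument tuple, so navigation above receives five positional arguments per call; a tiny self-contained analogue:

from multiprocessing import Pool

def add(a, b):
    return a + b

if __name__ == '__main__':
    with Pool(2) as p:
        print(p.starmap(add, [(1, 2), (3, 4)]))  # [3, 7]
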



def runADMM(G1, sizeOptVar, sizeData, lamb, rho, numiters, x, u, z, a, edgeWeights, useConvex, epsilon, mu):
    # print("a:",a)
    nodes = G1.GetNodes()
    edges = G1.GetEdges()

    maxNonConvexIters = 6 * numiters

    # Find max degree of graph; hash the nodes
    (maxdeg, counter) = (0, 0)
    node2mat = TIntIntH()
    for NI in G1.Nodes():
        maxdeg = np.maximum(maxdeg, NI.GetDeg())
        node2mat.AddDat(NI.GetId(), counter)
        counter = counter + 1

    # Stopping criteria
    eabs = math.pow(10, -2)
    erel = math.pow(10, -3)
    (r, s, epri, edual, counter) = (1, 1, 0, 0, 0)
    A = np.zeros((2 * edges, nodes))
    for EI in G1.Edges():
        A[2 * counter, node2mat.GetDat(EI.GetSrcNId())] = 1
        A[2 * counter + 1, node2mat.GetDat(EI.GetDstNId())] = 1
        counter = counter + 1
    (sqn, sqp) = (math.sqrt(nodes * sizeOptVar), math.sqrt(2 * sizeOptVar * edges))

    # Non-convex case - keeping track of best point so far
    bestx = x
    bestu = u
    bestz = z
    bestObj = 0
    cvxObj = 10000000 * np.ones((1, nodes))
    if(useConvex != 1):
        # Calculate objective
        for i in range(G1.GetNodes()):
            bestObj = bestObj + cvxObj[0, i]
        for EI in G1.Edges():
            weight = edgeWeights.GetDat(TIntPr(EI.GetSrcNId(), EI.GetDstNId()))
            edgeDiff = LA.norm(x[:, node2mat.GetDat(
                EI.GetSrcNId())] - x[:, node2mat.GetDat(EI.GetDstNId())])
            bestObj = bestObj + lamb * weight * \
                math.log(1 + edgeDiff / epsilon)
        initObj = bestObj

    # Run ADMM
    iters = 0
    maxProcesses = 80
    pool = Pool(processes=np.minimum(np.maximum(nodes, edges), maxProcesses))
    # while(iters < numiters and (r > epri or s > edual or iters < 1)):
    dt = []
    obj = []
    tevolution = []
    while(iters < numiters):
        sys.stdout.write('\r'+'network_lasso_cvx_status:'+str(int(100*iters/numiters))+'%')
        # print("iters:",iters)

        # x-update
        neighs = np.zeros(((2 * sizeOptVar + 1) * maxdeg, nodes))
        edgenum = 0
        numSoFar = TIntIntH()
        t0 = time.time()
        for EI in G1.Edges():
            if (not numSoFar.IsKey(EI.GetSrcNId())):
                numSoFar.AddDat(EI.GetSrcNId(), 0)
            counter = node2mat.GetDat(EI.GetSrcNId())
            counter2 = numSoFar.GetDat(EI.GetSrcNId())
            neighs[counter2 * (2 * sizeOptVar + 1), counter] = edgeWeights.GetDat(
                TIntPr(EI.GetSrcNId(), EI.GetDstNId()))
            neighs[counter2 * (2 * sizeOptVar + 1) + 1:counter2 * (2 *
                                                                   sizeOptVar + 1) + (sizeOptVar + 1), counter] = u[:, 2 * edgenum]
            neighs[counter2 * (2 * sizeOptVar + 1) + (sizeOptVar + 1):(counter2 + 1)
                   * (2 * sizeOptVar + 1), counter] = z[:, 2 * edgenum]
            numSoFar.AddDat(EI.GetSrcNId(), counter2 + 1)

            if (not numSoFar.IsKey(EI.GetDstNId())):
                numSoFar.AddDat(EI.GetDstNId(), 0)
            counter = node2mat.GetDat(EI.GetDstNId())
            counter2 = numSoFar.GetDat(EI.GetDstNId())
            neighs[counter2 * (2 * sizeOptVar + 1), counter] = edgeWeights.GetDat(
                TIntPr(EI.GetSrcNId(), EI.GetDstNId()))
            neighs[counter2 * (2 * sizeOptVar + 1) + 1:counter2 * (
                2 * sizeOptVar + 1) + (sizeOptVar + 1), counter] = u[:, 2 * edgenum + 1]
            neighs[counter2 * (2 * sizeOptVar + 1) + (sizeOptVar + 1):(counter2 + 1)
                   * (2 * sizeOptVar + 1), counter] = z[:, 2 * edgenum + 1]
            numSoFar.AddDat(EI.GetDstNId(), counter2 + 1)

            edgenum = edgenum + 1

        temp = np.concatenate((x, a, neighs, np.tile(
            [mu, sizeData, rho, lamb, sizeOptVar], (nodes, 1)).transpose()), axis=0)
        values = pool.map(solveX, temp.transpose())
        newx = np.array(values)[:, 0].tolist()
        newcvxObj = np.array(values)[:, 1].tolist()
        # print("newcvxObj:", newcvxObj)
        # x = np.array(newx).transpose()[0]
        # print("newx:",newx)
        x = np.array(newx).transpose()
        # print("Size of x:",x.shape,"x:",x)

        # cvxObj = np.reshape(np.array(newcvxObj), (-1, nodes))
        # print("cvxObj:",cvxObj)
        # z-update
        ztemp = z.reshape(2 * sizeOptVar, edges, order='F')
        utemp = u.reshape(2 * sizeOptVar, edges, order='F')
        xtemp = np.zeros((sizeOptVar, 2 * edges))
        counter = 0
        weightsList = np.zeros((1, edges))
        for EI in G1.Edges():
            xtemp[:, 2 *
                  counter] = np.array(x[:, node2mat.GetDat(EI.GetSrcNId())])
            xtemp[:, 2 * counter + 1] = x[:, node2mat.GetDat(EI.GetDstNId())]
            weightsList[0, counter] = edgeWeights.GetDat(
                TIntPr(EI.GetSrcNId(), EI.GetDstNId()))
            counter = counter + 1
        xtemp = xtemp.reshape(2 * sizeOptVar, edges, order='F')
        temp = np.concatenate((xtemp, utemp, ztemp, np.reshape(weightsList, (-1, edges)), np.tile(
            [epsilon, useConvex, rho, lamb, sizeOptVar], (edges, 1)).transpose()), axis=0)
        newz = pool.map(solveZ, temp.transpose())
        ztemp = np.array(newz).transpose()[0]
        ztemp = ztemp.reshape(sizeOptVar, 2 * edges, order='F')
        # For dual residual
        s = LA.norm(rho * np.dot(A.transpose(), (ztemp - z).transpose()))
        z = ztemp

        # u-update
        (xtemp, counter) = (np.zeros((sizeOptVar, 2 * edges)), 0)
        for EI in G1.Edges():
            xtemp[:, 2 *
                  counter] = np.array(x[:, node2mat.GetDat(EI.GetSrcNId())])
            xtemp[:, 2 * counter + 1] = x[:, node2mat.GetDat(EI.GetDstNId())]
            counter = counter + 1
        temp = np.concatenate(
            (u, xtemp, z, np.tile(rho, (1, 2 * edges))), axis=0)
        newu = pool.map(solveU, temp.transpose())
        u = np.array(newu).transpose()

        # Update best objective (for non-convex)
        if(useConvex != 1):
            tempObj = 0
            # Calculate objective
            for i in range(G1.GetNodes()):
                tempObj = tempObj + cvxObj[0, i]
            initTemp = tempObj
            for EI in G1.Edges():
                weight = edgeWeights.GetDat(
                    TIntPr(EI.GetSrcNId(), EI.GetDstNId()))
                edgeDiff = LA.norm(x[:, node2mat.GetDat(
                    EI.GetSrcNId())] - x[:, node2mat.GetDat(EI.GetDstNId())])
                tempObj = tempObj + lamb * weight * \
                    math.log(1 + edgeDiff / epsilon)
            # Update best variables
            if(tempObj <= bestObj):
                bestx = x
                bestu = u
                bestz = z
                bestObj = tempObj
                print("Iteration ", iters, "; Obj = ",
                      tempObj, "; Initial = ", initTemp)

            if(iters == numiters - 1 and numiters < maxNonConvexIters):
                if(bestObj == initObj):
                    numiters = numiters + 1

        # Stopping criterion - p19 of ADMM paper
        epri = sqp * eabs + erel * \
            np.maximum(LA.norm(np.dot(A, x.transpose()), 'fro'),
                       LA.norm(z, 'fro'))
        edual = sqn * eabs + erel * \
            LA.norm(np.dot(A.transpose(), u.transpose()), 'fro')
        r = LA.norm(np.dot(A, x.transpose()) - z.transpose(), 'fro')
        s = s

        #print r, epri, s, edual
        t1 = time.time() - t0
        dt.append(t1)

        # objtemp = (LA.norm(x-a))**2+LA.norm(x)
        objtemp = (LA.norm(x - a))**2
        for edge in G1.Edges():
            node1 = edge.GetSrcNId()
            node2 = edge.GetDstNId()
            objtemp = objtemp + lamb * LA.norm(x[:, node1] - x[:, node2])

        obj.append(objtemp)

        iters = iters + 1

    pool.close()
    pool.join()

    objerror = []
    temp = 0

    for k in range(numiters):
        temp = temp + dt[k]
        tevolution.append(temp)

    for k in range(numiters):
        objerror.append(np.absolute(obj[k] - obj[-1]))

    return x, tevolution, obj, objerror
def pmap(f, col):
    pool = Pool(5)
    result = pool.map(f, col)
    pool.close()
    return result
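One caveat with a helper like pmap: the function handed to Pool.map must be picklable, i.e. defined at module level rather than a lambda or nested closure. A hedged usage sketch:

def square(x):
    # top-level function: picklable, so it can be shipped to the workers
    return x * x

if __name__ == '__main__':
    print(pmap(square, range(10)))  # [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]
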
Example #37
 """
 LoadModel(ifhdnn=False)
 print("New ESOINN model has %d clusters" %
       GPARAMS.Esoinn_setting.Model.class_id)
 os.system("cp *.ESOINN Sfactor.in ./networks")
 Dataer_Process = Process(target=dataer, args=(DataQueue, ))
 Dataer_Process.start()
 TrainerPool = Pool(len(GPARAMS.Compute_setting.Gpulist))
 Resultlist = []
 for i in range(
         max(GPARAMS.Esoinn_setting.Model.class_id,
             GPARAMS.Train_setting.Modelnumperpoint)):
     print("Create HDNN subnet for class %d" % i)
     result = TrainerPool.apply_async(trainer, (DataQueue, GPUQueue))
     Resultlist.append(result)
 TrainerPool.close()
 for i in range(
         max(GPARAMS.Esoinn_setting.Model.class_id,
             GPARAMS.Train_setting.Modelnumperpoint)):
     tmp = Resultlist[i].get()
     print(tmp)
 TrainerPool.terminate()
 TrainerPool.join()
 Dataer_Process.join()
 """
 if os.path.exists(GPARAMS.Compute_setting.Traininglevel):
     os.system("mkdir %s/Stage%d"%(GPARAMS.Compute_setting.Traininglevel,GPARAMS.Train_setting.Trainstage))
     os.system("mv %s/*.record %s/Stage%d"%(GPARAMS.Compute_setting.Traininglevel,\
                                            GPARAMS.Compute_setting.Traininglevel,\
                                            GPARAMS.Train_setting.Trainstage)) 
 for i in range(len(GPARAMS.System_setting)):
Exemple #38
0
def main():  
    global src_port_ids_global
    global dst_port_ids_global
    global port_map_global
    global port_reverse_map_global
    global ntf_global
    global ttf_global
    global DATABASE_FILE
    
    parser = ArgumentParser(description="Generate Test Packets for Internet2")
    parser.add_argument("-p", dest="percentage", type=int,
                      default="100",
                      help="Percentage of test terminals")
    parser.add_argument("-f", dest="filename",
                      default="internet2.sqlite",
                      help="Filename of the database")
    parser.add_argument("-e", action="store_true",
                      default=False,
                      help="Edge port only")
    args = parser.parse_args()
    
    DATABASE_FILE = "work/%s" % args.filename
     
    cs = juniperRouter(1)
    output_port_addition = cs.PORT_TYPE_MULTIPLIER * cs.OUTPUT_PORT_TYPE_CONST
     
    # Load .tf files
    ntf_global = load_internet2_backbone_ntf()
    ttf_global = load_internet2_backbone_ttf()
    (port_map_global, port_reverse_map_global) = load_internet2_backbone_port_to_id_map()
    
    # Initialize the database
    if os.access(DATABASE_FILE, os.F_OK):
        os.remove(DATABASE_FILE)
    
    conn = sqlite3.connect(DATABASE_FILE)
    conn.execute('CREATE TABLE %s (rule TEXT, input_port TEXT, output_port TEXT, action TEXT, file TEXT, line TEXT)' % TABLE_NETWORK_RULES)
    conn.execute('CREATE TABLE %s (rule TEXT, input_port TEXT, output_port TEXT)' % TABLE_TOPOLOGY_RULES)
    conn.execute('CREATE TABLE %s (header TEXT, input_port INTEGER, output_port INTEGER, ports TEXT, no_of_ports INTEGER, rules TEXT, no_of_rules INTEGER)' % TABLE_TEST_PACKETS)
    conn.execute('CREATE TABLE %s (header TEXT, input_port INTEGER, output_port INTEGER, ports TEXT, no_of_ports INTEGER, rules TEXT, no_of_rules INTEGER)' % TABLE_TEST_PACKETS_LOCALLY_COMPRESSED)
    conn.execute('CREATE TABLE %s (rules TEXT, no_of_rules INTEGER)' % TABLE_TEST_PACKETS_GLOBALLY_COMPRESSED)
    conn.execute('CREATE TABLE %s (rule TEXT)' % TABLE_RESULT_RULES)

    rule_count = 0
    for tf in ntf_global.tf_list:
        rule_count += len(tf.rules)
        for rule in tf.rules:
            # print '-------------'
            # print tf.rules[4]
            # exit(1) 
            query = "INSERT INTO %s VALUES (?, ?, ?, ?, ?, ?)" % TABLE_NETWORK_RULES
            conn.execute(query, (rule['id'],' '.join(map(str, rule['in_ports'])), ' '.join(map(str, rule['out_ports'])), rule['action'], rule["file"], ' '.join(map(str, rule["line"]))))
    print "Total Rules: %d" % rule_count
    conn.commit()
    
    rule_count = len(ttf_global.rules) 
    for rule in ttf_global.rules:

        query = "INSERT INTO %s VALUES (?, ?, ?)" % TABLE_TOPOLOGY_RULES 
        conn.execute(query, (rule['id'],' '.join(map(str, rule['in_ports'])), ' '.join(map(str, rule['out_ports']))))  
    print "Total Links: %d" % rule_count
   
    # Generate all ports
    for rtr in port_map_global.keys():
        src_port_ids_global |= set(port_map_global[rtr].values())
    
    # Debug output (disabled so the rest of main() can run):
    # print '-------------'
    # print port_map_global
    # exit(1)
    
    total_length = len(src_port_ids_global)
    if args.e == True:
        for rule in ttf_global.rules:
            if rule['out_ports'][0] in src_port_ids_global:
                src_port_ids_global.remove(rule['out_ports'][0])    
    
    new_length = len(src_port_ids_global)* args.percentage / 100
    src_port_ids_global = random.sample(src_port_ids_global, new_length)
    print "Total Length: %d" % total_length
    print "New Length: %d" % new_length
    
    for port in src_port_ids_global:
        port += output_port_addition
        dst_port_ids_global.add(port)
    
    #src_port_ids_global = [300013]
    #dst_port_ids_global = [320010]
    
    conn.commit()
    conn.close()
    
    # Run reachability
    start_time = time.time()
    
    pool = Pool()
    result = pool.map_async(find_test_packets, src_port_ids_global)

    # Close
    pool.close()
    pool.join()
    
    end_time = time.time()
    
    test_packet_count = result.get()
    total_paths = sum(test_packet_count)    
    print "========== Before Compression ========="
    print "Total Paths = %d" % total_paths
    print "Average packets per port = %f" % (float(total_paths) / len(src_port_ids_global))
    print "Total Time = %fs" % (end_time - start_time)
    
    #Global Compressing 
    start_time = time.time()
       
    conn = sqlite3.connect(DATABASE_FILE, 6000)    
    result_rule_lists = []
    query = "SELECT rules FROM %s"  % TABLE_TEST_PACKETS_LOCALLY_COMPRESSED
    rows = conn.execute(query)

    for row in rows:
        result_rule_lists.append(row[0].split())
    conn.close()
  
    chunk_size = 80000
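    # Repeatedly compress the rule lists in parallel chunks; stop once a full
    # round shrinks the number of rule lists by less than 1%.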
    while(True):
        print "Start a new round!"
        conn = sqlite3.connect(DATABASE_FILE, 6000)
        conn.execute('DROP TABLE IF EXISTS %s' % TABLE_SCRATCHPAD)
        conn.execute('CREATE TABLE %s (rules TEXT, no_of_rules INTEGER)' % TABLE_SCRATCHPAD)
        conn.commit()    
        conn.close()
        
        start_len = len(result_rule_lists)
        print start_len
        
        pool = Pool()        
        no_of_chunks = len(result_rule_lists) / chunk_size + 1      
        rule_list_chunks = chunks(result_rule_lists, no_of_chunks)            
        result = pool.map_async(rule_lists_compress, rule_list_chunks)

        # Close
        pool.close()
        pool.join()
        result.get()
        
        print "End of this round."
        
        result_rule_lists = read_rule_lists_from_database(TABLE_SCRATCHPAD)
        
        end_len = len(result_rule_lists)
        if(float(end_len) / float(start_len) > 0.99):
            break

    end_time = time.time()
    
    query = "INSERT INTO %s VALUES (?, ?)" % TABLE_TEST_PACKETS_GLOBALLY_COMPRESSED
    query2 = "INSERT INTO %s VALUES (?)" % TABLE_RESULT_RULES
    
    total_paths = len(result_rule_lists)
    total_length = 0
    
    conn = sqlite3.connect(DATABASE_FILE, 6000)
    conn.execute('DROP TABLE IF EXISTS %s' % TABLE_TEST_PACKETS_GLOBALLY_COMPRESSED)
    conn.execute('CREATE TABLE %s (rules TEXT, no_of_rules INTEGER)' % TABLE_TEST_PACKETS_GLOBALLY_COMPRESSED)

    for rule_list in result_rule_lists:
        total_length += len(rule_list)
        conn.execute(query, (" ".join(rule_list), len(rule_list)))
        for rule in rule_list:
            conn.execute(query2, (rule,))
     
    conn.commit()    
    conn.close()
    
    print "========== After Compression ========="
    print "Total Paths = %d" % total_paths
    print "Average packets per port = %f" % (float(total_paths) / len(src_port_ids_global))
    print "Average length of rule list = %f" % (float(total_length) / total_paths)
    print "Total Time = %fs" % (end_time - start_time)
Exemple #39
0
def map_model_reactions(model1,
                        model2,
                        cpd_pred,
                        nproc=1,
                        outpath='.',
                        log=False,
                        gene=False,
                        compartment_map={}):
    """Map reactions of two models."""
    # Mapping of reactions
    reaction_pairs = len(model1.reactions) * len(model2.reactions)

    # Reaction prior
    # For the prior, use a guesstimate that 95%
    # of the smaller model can be mapped.
    reaction_prior = (0.95 * min(len(model1.reactions), len(
        model2.reactions))) / reaction_pairs
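    # Illustrative arithmetic (example numbers, not from the source): with
    # 1,000 and 1,500 reactions there are 1.5e6 pairs, so the prior is
    # 0.95 * 1000 / 1.5e6 ~ 6.3e-4.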

    # Initialize parallel pool of workers
    chunksize = reaction_pairs // nproc
    pool = Pool(nproc)

    # Reaction ID
    # Marginal probability of observing two reactions with the same ids.
    tasks = ((util.id_equals, (r1.id, r2.id)) for r1, r2 in product(
        itervalues(model1.reactions), itervalues(model2.reactions)))
    result = pool.map(parallel_equel, tasks, chunksize=chunksize)
    reaction_id_equal_marg = sum(result) / float(reaction_pairs)

    # Marginal probability of observing two reactions with different ids.
    reaction_id_not_equal_marg = 1.0 - reaction_id_equal_marg

    print('Calculating reaction ID likelihoods...')
    sys.stdout.flush()
    reaction_id_likelihoods = pairwise_likelihood(
        pool, chunksize, model1.reactions, model2.reactions,
        reaction_id_likelihood,
        (reaction_prior, reaction_id_equal_marg, reaction_id_not_equal_marg))

    # Reaction name
    # Marginal probability of observing two reactions with the same name.
    tasks = ((util.name_equals, (r1.name, r2.name)) for r1, r2 in product(
        itervalues(model1.reactions), itervalues(model2.reactions)))
    result = pool.map(parallel_equel, tasks, chunksize=chunksize)
    reaction_name_equal_marg = sum(result) / float(reaction_pairs)

    print('Calculating reaction name likelihoods...')
    sys.stdout.flush()
    reaction_name_likelihoods = pairwise_likelihood(
        pool, chunksize, model1.reactions, model2.reactions,
        reaction_name_likelihood, (reaction_prior, reaction_name_equal_marg))

    # Reaction equation

    print('Calculating reaction equation likelihoods...')
    sys.stdout.flush()
    reaction_equation_likelihoods = pairwise_likelihood(
        pool, chunksize, model1.reactions, model2.reactions,
        reaction_equation_compound_mapping_likelihood,
        (cpd_pred, compartment_map))

    # Reaction genes
    # For each gene, the marginal probability of observing that gene
    # in each model. We use this as an approximation of the probability of
    # observing a pair of genes in two reactions given that the reactions
    # do _not_ match.
    if gene:
        print('Calculating reaction genes likelihoods...')
        sys.stdout.flush()
        reaction_genes_likelihoods = pairwise_likelihood(
            pool, chunksize, model1.reactions, model2.reactions,
            reaction_genes_likelihood, ())
    else:
        reaction_genes_likelihoods = pairwise_likelihood(
            pool, chunksize, model1.reactions, model2.reactions,
            fake_likelihood, ())

    pool.close()
    pool.join()

    if log:
        merge_result = pd.merge(reaction_id_likelihoods,
                                reaction_name_likelihoods,
                                left_index=True,
                                right_index=True,
                                suffixes=('_id', '_name'))
        merge_result = pd.merge(merge_result,
                                reaction_equation_likelihoods,
                                left_index=True,
                                right_index=True,
                                suffixes=('_name', '_equation'))
        merge_result = pd.merge(merge_result,
                                reaction_genes_likelihoods,
                                left_index=True,
                                right_index=True,
                                suffixes=('_equation', '_genes'))

        merge_result.to_csv(outpath + '/reaction_log.tsv', sep='\t')

    all_likelihoods = [
        reaction_id_likelihoods, reaction_name_likelihoods,
        reaction_equation_likelihoods, reaction_genes_likelihoods
    ]

    return (bayes_posterior(reaction_prior,
                            likelihood_products(all_likelihoods)),
            bayes_posterior(reaction_prior, reaction_id_likelihoods),
            bayes_posterior(reaction_prior, reaction_name_likelihoods),
            bayes_posterior(reaction_prior, reaction_equation_likelihoods),
            bayes_posterior(reaction_prior, reaction_genes_likelihoods))
Exemple #40
0
    def readMSMSData(self,msFile,targets,tic_cutoff,frag_cutoff):
        # make DecoID object


        # write temporary peak file
        targets.to_csv(self.uid + ".csv", index=False)

        # read in file and save all spectra
        self.decID.readData(msFile,self.ms2_resolution, True, True, self.ppm, peakDefinitions=self.uid + ".csv", tic_cutoff=tic_cutoff,
                       frag_cutoff=frag_cutoff)

        # structure to hold spectra
        output_dict = {}
        polarity = 0

        # process spectra for each CE
        if len(self.decID.samples) > 0:

            # get charge
            polarity = self.decID.samples[0]["mode"]
            switcher = {"Positive": 1, "Negative": -1}
            polarity = switcher[polarity]

            self.decID.samples = [x for x in self.decID.samples if targets.at[targets.index.values[x["group"]],"Charge"] == polarity]
            samplesAll = deepcopy(self.decID.samples)


            # get unique CEs
            ces = list(set([x["CE"] for x in samplesAll]))
            ces.sort()

            # iterate over CEs

            ceList = []
            args = []
            gs = []
            for ce in ces:

                # parse relevant samples
                self.decID.samples = [x for x in samplesAll if x["CE"] == ce]
                self.decID.label = str(ce)

                groups = list(set([x["group"] for x in self.decID.samples]))

                for g in groups:
                    specs = [x["spectra"] for x in self.decID.samples if x["group"] == g]
                    args.append([specs,self.ms2_resolution])
                    ceList.append(ce)
                    gs.append(g)

            if len(args) > 0:
                p = Pool(min([self.numCores,len(args)]))
                results = p.starmap(sumSpectra,args,chunksize=int(len(args)/min([self.numCores,len(args)])))
                p.close()
                p.join()

                spectra = {(g,ce):spec for g,spec,ce in zip(gs,results,ceList)}


                names = []
                ceList = []
                args = []
                for (ind,ce),spectrum in spectra.items():
                    row = targets.iloc[ind,:]
                    rts = [x["rt"] for x in self.decID.samples if x["group"] == ind]
                    args.append([spectrum,rts,self.decID.ms1,row["mz"],row["rt_start"],row["rt_end"],self.ppm])
                    names.append(row["Name"])
                    ceList.append(ce)

                p = Pool(min([self.numCores,len(args)]))
                results = p.starmap(normalizeSpectrum,args,chunksize=int(len(args)/min([self.numCores,len(args)])))
                p.close()
                p.join()

                for name,ce,spectrum in zip(names,ceList,results):
                    if name not in output_dict:
                        output_dict[name] = {}
                    output_dict[name][ce] = spectrum


        os.remove(self.uid + ".csv")

        return output_dict,polarity
# data = random_class_selection(data)
Y = data["Y"]
X = data["X"]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X.values.astype('U')).toarray()

x_train = X
y_train = Y

# kmeans = KMeans(n_clusters=4,n_jobs=-1).fit(x_train)
# print('Score:',metrics.adjusted_rand_score(y_train,kmeans.predict(x_train)))

cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(x_train.T,
                                                 4,
                                                 12,
                                                 error=0.00001,
                                                 maxiter=100000,
                                                 init=None)
print('Score:', metrics.adjusted_rand_score(y_train, np.argmax(u, axis=0)))

docinc = [[] for x in range(4)]
for k in range(x_train.shape[0]):
    # For KMeans clustering, replace the line below with the commented line:
    # docinc[kmeans.predict(x_train[k])[0]]
    docinc[np.argmax(u[:, k])].append(k)

clus = [0, 1, 2, 3, 4, 5, 6, 7]
mpool = Pool(4)
mapped_data = mpool.map(generate_cf_matrix, clus)
mpool.close()
from multiprocessing import Process, Pool
import time


def func(num):
    num += 1
    # print(num)


if __name__ == '__main__':
    p = Pool(5)
    start = time.time()
    p.map(func, [i for i in range(1000)])
    p.close()  # no more tasks can be submitted to the pool
    p.join()  # wait for all tasks in the pool to finish
    print(time.time() - start)

    p_l = []
    start = time.time()
    for i in range(1000):
        p = Process(target=func, args=(i,))
        p.start()
        p_l.append(p)
    [i.join() for i in p_l]
    print(time.time() - start)
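    # The Pool version reuses 5 worker processes, whereas the loop above pays
    # the process start-up cost 1000 times, so the Pool run is typically much
    # faster.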
Exemple #43
0
def extract_from_shp(lyr, df_xy, id_field=None, n_jobs=0):
    ''' Extract the id of the polygon that each point in df_xy is within'''
    '''ds = ogr.Open(shp)
    lyr = ds.GetLayer()'''

    id_field_given = True
    if not id_field:
        id_field = 'name'
        id_field_given = False

    if n_jobs:
        t1 = time.time()
        args = []
        n_features = lyr.GetFeatureCount()
        for i in xrange(n_features):
            feature = lyr.GetFeature(i)
            geometry = feature.GetGeometryRef()
            geom_coords = stem_conus.get_coords_from_geometry(
                geometry, multipart='split')

            # Initially select only samples that fit within the bounds of this feature
            min_x, max_x, min_y, max_y = geometry.GetEnvelope()
            xy_temp = df_xy[(df_xy.x >= min_x) & (df_xy.x < max_x) &
                            (df_xy.y > min_y) & (df_xy.y <= max_y)]
            #if id_field_given:
            #    feature_id = feature.GetField(id_field)
            #else:
            feature_id = feature.GetFID()
            args.append(
                [geom_coords, xy_temp, id_field, feature_id, i, n_features])
            feature.Destroy()
            sys.stdout.write(
                '\rInitial filter of points for (%.1f%%) of features' %
                (float(i) / n_features * 100))
            sys.stdout.flush()
        print '\nTime for getting args: %.1f seconds\n' % (time.time() - t1)

        # Predict in parallel
        t1 = time.time()
        pool = Pool(n_jobs)
        points = pool.map(par_within, args, 1)
        pool.close()
        pool.join()
        print '\nTime for extraction: %.1f minutes\n' % (
            (time.time() - t1) / 60)

        t1 = time.time()
        for i, p in points:
            df_xy.ix[p, 'tile_fid'] = i
        print 'Time for adding to df: %.1f seconds\n' % ((time.time() - t1))

    else:
        for i in xrange(lyr.GetFeatureCount()):
            feature = lyr.GetFeature(i)
            geometry = feature.GetGeometryRef()

            #geom_coords = stem_conus.get_coords_from_geometry(geometry)
            min_x, max_x, min_y, max_y = geometry.GetEnvelope()

            # Initially select only samples that fit within the bounds of this feature
            xy_temp = df_xy[(df_xy.x >= min_x) & (df_xy.x < max_x) &
                            (df_xy.y > min_y) & (df_xy.y <= max_y)]

            # With square tiles, this is not necessary
            #points = [i for i, (x, y) in xy_temp[['x','y']].iterrows() if within(x, y, geometry)]

            if id_field_given:
                feature_id = feature.GetField(id_field)
            else:
                feature_id = feature.GetFID()
            df_xy.ix[xy_temp.index, 'tile_fid'] = feature_id
            feature.Destroy()

    return df_xy
Exemple #44
0
def map_model_compounds(model1,
                        model2,
                        nproc=1,
                        outpath='.',
                        log=False,
                        kegg=False):
    """Map compounds of two models."""
    compound_pairs = len(model1.compounds) * len(model2.compounds)

    # Compound prior
    # For the prior, use a guesstimate that 95% of the
    # smaller model can be mapped.
    compound_prior = (0.95 * min(len(model1.compounds), len(
        model2.compounds))) / compound_pairs

    # Initialize parallel pool of workers
    chunksize = compound_pairs // nproc
    pool = Pool(nproc)
    # Compound ID
    # Marginal probability of observing two equal compound IDs
    tasks = ((util.id_equals, (c1.id, c2.id)) for c1, c2 in product(
        itervalues(model1.compounds), itervalues(model2.compounds)))
    result = pool.map(parallel_equel, tasks, chunksize=chunksize)
    compound_id_marg = sum(result) / float(compound_pairs)

    print('Calculating compound ID likelihoods...')
    sys.stdout.flush()
    compound_id_likelihoods = pairwise_likelihood(
        pool, chunksize, model1.compounds, model2.compounds,
        compound_id_likelihood, (compound_prior, compound_id_marg))

    # Compound name
    # Marginal probability of observing two similar names
    tasks = ((util.name_equals, (c1.name, c2.name)) for c1, c2 in product(
        itervalues(model1.compounds), itervalues(model2.compounds)))
    result = pool.map(parallel_equel, tasks, chunksize=chunksize)
    compound_name_marg = sum(result) / float(compound_pairs)

    print('Calculating compound name likelihoods...')
    sys.stdout.flush()
    compound_name_likelihoods = pairwise_likelihood(
        pool, chunksize, model1.compounds, model2.compounds,
        compound_name_likelihood, (compound_prior, compound_name_marg))

    # Compound charge
    # Marginal probability of observing two compounds with the same charge
    compound_charge_equal_marg = sum(
        c1.charge is not None and c2.charge is not None
        and c1.charge == c2.charge
        for c1, c2 in product(itervalues(model1.compounds),
                              itervalues(model2.compounds))) / compound_pairs

    # Marginal probability of observing two compounds with different charge
    compound_charge_not_equal_marg = sum(
        c1.charge is not None and c2.charge is not None
        and c1.charge != c2.charge
        for c1, c2 in product(itervalues(model1.compounds),
                              itervalues(model2.compounds))) / compound_pairs

    print('Calculating compound charge likelihoods...')
    sys.stdout.flush()

    compound_charge_likelihoods = pairwise_likelihood(
        pool, chunksize, model1.compounds, model2.compounds,
        compound_charge_likelihood,
        (compound_prior, compound_charge_equal_marg,
         compound_charge_not_equal_marg))

    # Compound formula
    # Marginal probability of observing two compounds with the same formula
    tasks = ((util.formula_equals, (c1.formula, c2.formula, c1.charge,
                                    c2.charge))
             for c1, c2 in product(itervalues(model1.compounds),
                                   itervalues(model2.compounds)))
    result = pool.map(parallel_equel, tasks, chunksize=chunksize)
    compound_formula_equal_marg = sum(result) / float(compound_pairs)

    # Marginal probability of observing two compounds with different formula
    compound_formula_not_equal_marg = 1.0 - compound_formula_equal_marg - (sum(
        c1.formula is None or c2.formula is None
        for c1, c2 in product(itervalues(model1.compounds),
                              itervalues(model2.compounds))) / compound_pairs)

    print('Calculating compound formula likelihoods...')
    sys.stdout.flush()
    compound_formula_likelihoods = pairwise_likelihood(
        pool, chunksize, model1.compounds, model2.compounds,
        compound_formula_likelihood,
        (compound_prior, compound_formula_equal_marg,
         compound_formula_not_equal_marg))

    # Compound KEGG id
    if kegg:  # run KEGG id mapping
        # Marginal probability of observing two compounds
        # where KEGG ids are equal
        compound_kegg_equal_marg = sum(
            c1.kegg is not None and c2.kegg is not None and c1.kegg == c2.kegg
            for c1, c2 in product(itervalues(model1.compounds),
                                  itervalues(
                                      model2.compounds))) / compound_pairs

        # Marginal probability of observing two compounds
        # where KEGG ids are different
        compound_kegg_not_equal_marg = sum(
            c1.kegg is not None and c2.kegg is not None and c1.kegg != c2.kegg
            for c1, c2 in product(itervalues(model1.compounds),
                                  itervalues(
                                      model2.compounds))) / compound_pairs

        print('Calculating compound KEGG ID likelihoods...')
        sys.stdout.flush()
        compound_kegg_likelihoods = pairwise_likelihood(
            pool, chunksize, model1.compounds, model2.compounds,
            compound_kegg_likelihood,
            (compound_prior, compound_kegg_equal_marg,
             compound_kegg_not_equal_marg))
    else:  # run fake mapping
        compound_kegg_likelihoods = pairwise_likelihood(
            pool, chunksize, model1.compounds, model2.compounds,
            fake_likelihood, ())

    pool.close()
    pool.join()

    if log:
        merge_result = pd.merge(compound_id_likelihoods,
                                compound_name_likelihoods,
                                left_index=True,
                                right_index=True,
                                suffixes=('_id', '_name'))
        merge_result = pd.merge(merge_result,
                                compound_charge_likelihoods,
                                left_index=True,
                                right_index=True,
                                suffixes=('_name', '_charge'))
        merge_result = pd.merge(merge_result,
                                compound_formula_likelihoods,
                                left_index=True,
                                right_index=True,
                                suffixes=('_charge', '_formula'))
        merge_result = pd.merge(merge_result,
                                compound_kegg_likelihoods,
                                left_index=True,
                                right_index=True,
                                suffixes=('_formula', '_kegg'))

        merge_result.to_csv(outpath + '/compound_log.tsv', sep='\t')

    all_likelihoods = [
        compound_id_likelihoods, compound_name_likelihoods,
        compound_charge_likelihoods, compound_formula_likelihoods,
        compound_kegg_likelihoods
    ]

    return (bayes_posterior(compound_prior,
                            likelihood_products(all_likelihoods)),
            bayes_posterior(compound_prior, compound_id_likelihoods),
            bayes_posterior(compound_prior, compound_name_likelihoods),
            bayes_posterior(compound_prior, compound_charge_likelihoods),
            bayes_posterior(compound_prior, compound_formula_likelihoods),
            bayes_posterior(compound_prior, compound_kegg_likelihoods))
Exemple #45
0
def aovhm_periodfind(
        times,
        mags,
        errs,
        nharmonics=6,
        magsarefluxes=False,
        autofreq=True,
        startp=None,
        endp=None,
        normalize=True,
        stepsize=1.0e-4,
        nbestpeaks=5,
        periodepsilon=0.1,  # 0.1
        sigclip=10.0,
        nworkers=None,
        verbose=True):
    '''This runs a parallel AoV period search.

    NOTE: normalize = True here as recommended by Schwarzenberg-Czerny 1996,
    i.e. mags will be normalized to zero and rescaled so their variance = 1.0

    '''

    # get rid of nans first and sigclip
    stimes, smags, serrs = sigclip_magseries(times,
                                             mags,
                                             errs,
                                             magsarefluxes=magsarefluxes,
                                             sigclip=sigclip)

    # make sure there are enough points to calculate a spectrum
    if len(stimes) > 9 and len(smags) > 9 and len(serrs) > 9:

        # get the frequencies to use
        if startp:
            endf = 1.0 / startp
        else:
            # default start period is 0.1 day
            endf = 1.0 / 0.1

        if endp:
            startf = 1.0 / endp
        else:
            # default end period is length of time series
            startf = 1.0 / (stimes.max() - stimes.min())

        # if we're not using autofreq, then use the provided frequencies
        if not autofreq:
            frequencies = np.arange(startf, endf, stepsize)
            if verbose:
                LOGINFO(
                    'using %s frequency points, start P = %.3f, end P = %.3f' %
                    (frequencies.size, 1.0 / endf, 1.0 / startf))
        else:
            # this gets an automatic grid of frequencies to use
            frequencies = get_frequency_grid(stimes,
                                             minfreq=startf,
                                             maxfreq=endf)
            if verbose:
                LOGINFO('using autofreq with %s frequency points, '
                        'start P = %.3f, end P = %.3f' %
                        (frequencies.size, 1.0 / frequencies.max(),
                         1.0 / frequencies.min()))

        # map to parallel workers
        if (not nworkers) or (nworkers > NCPUS):
            nworkers = NCPUS
            if verbose:
                LOGINFO('using %s workers...' % nworkers)

        pool = Pool(nworkers)

        # renormalize the working mags to zero and scale them so that the
        # variance = 1 for use with our LSP functions
        if normalize:
            nmags = (smags - npmedian(smags)) / npstd(smags)
        else:
            nmags = smags

        # figure out the weighted variance
        # www.itl.nist.gov/div898/software/dataplot/refman2/ch2/weighvar.pdf
        magvariance_top = npsum(nmags / (serrs * serrs))
        magvariance_bot = (nmags.size - 1) * npsum(
            1.0 / (serrs * serrs)) / nmags.size
        magvariance = magvariance_top / magvariance_bot

        tasks = [(stimes, nmags, serrs, x, nharmonics, magvariance)
                 for x in frequencies]

        lsp = pool.map(aovhm_theta_worker, tasks)

        pool.close()
        pool.join()
        del pool

        lsp = nparray(lsp)
        periods = 1.0 / frequencies

        # find the nbestpeaks for the periodogram: 1. sort the lsp array by
        # highest value first 2. go down the values until we find five
        # values that are separated by at least periodepsilon in period

        # make sure to filter out non-finite values
        finitepeakind = npisfinite(lsp)
        finlsp = lsp[finitepeakind]
        finperiods = periods[finitepeakind]

        # make sure that finlsp has finite values before we work on it
        try:

            bestperiodind = npargmax(finlsp)

        except ValueError:

            LOGERROR('no finite periodogram values '
                     'for this mag series, skipping...')
            return {
                'bestperiod': npnan,
                'bestlspval': npnan,
                'nbestpeaks': nbestpeaks,
                'nbestlspvals': None,
                'nbestperiods': None,
                'lspvals': None,
                'periods': None,
                'method': 'mav',
                'kwargs': {
                    'startp': startp,
                    'endp': endp,
                    'stepsize': stepsize,
                    'normalize': normalize,
                    'nharmonics': nharmonics,
                    'autofreq': autofreq,
                    'periodepsilon': periodepsilon,
                    'nbestpeaks': nbestpeaks,
                    'sigclip': sigclip
                }
            }

        sortedlspind = np.argsort(finlsp)[::-1]
        sortedlspperiods = finperiods[sortedlspind]
        sortedlspvals = finlsp[sortedlspind]

        prevbestlspval = sortedlspvals[0]
        # now get the nbestpeaks
        nbestperiods, nbestlspvals, peakcount = ([finperiods[bestperiodind]],
                                                 [finlsp[bestperiodind]], 1)
        prevperiod = sortedlspperiods[0]

        # find the best nbestpeaks in the lsp and their periods
        for period, lspval in zip(sortedlspperiods, sortedlspvals):

            if peakcount == nbestpeaks:
                break
            perioddiff = abs(period - prevperiod)
            bestperiodsdiff = [abs(period - x) for x in nbestperiods]

            # print('prevperiod = %s, thisperiod = %s, '
            #       'perioddiff = %s, peakcount = %s' %
            #       (prevperiod, period, perioddiff, peakcount))

            # this ensures that this period is different from the last
            # period and from all the other existing best periods by
            # periodepsilon to make sure we jump to an entire different peak
            # in the periodogram
            if (perioddiff > (periodepsilon * prevperiod)
                    and all(x > (periodepsilon * prevperiod)
                            for x in bestperiodsdiff)):
                nbestperiods.append(period)
                nbestlspvals.append(lspval)
                peakcount = peakcount + 1

            prevperiod = period

        return {
            'bestperiod': finperiods[bestperiodind],
            'bestlspval': finlsp[bestperiodind],
            'nbestpeaks': nbestpeaks,
            'nbestlspvals': nbestlspvals,
            'nbestperiods': nbestperiods,
            'lspvals': lsp,
            'periods': periods,
            'method': 'mav',
            'kwargs': {
                'startp': startp,
                'endp': endp,
                'stepsize': stepsize,
                'normalize': normalize,
                'nharmonics': nharmonics,
                'autofreq': autofreq,
                'periodepsilon': periodepsilon,
                'nbestpeaks': nbestpeaks,
                'sigclip': sigclip
            }
        }

    else:

        LOGERROR('no good detections for these times and mags, skipping...')
        return {
            'bestperiod': npnan,
            'bestlspval': npnan,
            'nbestpeaks': nbestpeaks,
            'nbestlspvals': None,
            'nbestperiods': None,
            'lspvals': None,
            'periods': None,
            'method': 'mav',
            'kwargs': {
                'startp': startp,
                'endp': endp,
                'stepsize': stepsize,
                'normalize': normalize,
                'nharmonics': nharmonics,
                'autofreq': autofreq,
                'periodepsilon': periodepsilon,
                'nbestpeaks': nbestpeaks,
                'sigclip': sigclip
            }
        }
Exemple #46
0
def extract_var(year,
                var_name,
                by_tile,
                data_band,
                data_type,
                df_tile,
                df_xy,
                basepath,
                search_str,
                path_filter,
                mosaic_tx,
                file_count,
                n_files,
                nodata=None,
                kernel=False):
    '''
    Return a dataframe of extracted values for this variable, along with the
    updated file count.
    '''
    t0 = time.time()
    dfs = []  # For storing kernel and stats
    var_col = var_name  # + str(year)
    file_col = 'file_' + var_col
    #file_count = last_file
    # Store the filepath for each tile
    if by_tile:
        df_tile[file_col] = [
            find_file(basepath, search_str.format(year), tile, path_filter)
            for tile in df_tile.tile_id
        ]
    else:
        df_tile[file_col] = find_file(basepath,
                                      search_str.format(year),
                                      path_filter=path_filter)
    # Handle any rows for which the file is null
    if df_tile[file_col].isnull().any():
        df_null = df_tile[df_tile[file_col].isnull()]
        print 'Tiles excluded from extractions for %s from %s:' % (var_name,
                                                                   year)
        for ind, row in df_null.iterrows():
            print row['tile_str']
        print ''
        n_null = len(df_null)
        # Make the file name a unique integer so that it can be
        #   distinguished from real files and from other null files
        df_tile.loc[df_null.index, file_col] = range(n_null)

    # Get the file string for each xy for each year
    df_xy[file_col] = ''  # Creates the column but keeps it empty
    for tile in df_xy.tile_id.unique():
        try:
            df_xy.loc[df_xy['tile_id'] == tile,
                      file_col] = df_tile.loc[df_tile['tile_id'] == tile,
                                              file_col].values[0]
        except:
            import pdb
            pdb.set_trace()

    # For each file, get the dataset as an array and extract all values at each row col
    val_cols = ['%s_%s' % (var_col, i) for i in range(1, 10)]
    '''for f in df_tile[file_col].unique():
        print 'Extracting for array %s of approximately %s from:\n%s\n'\
        % (last_file, n_files, f)
        dfs.append(extract_by_rowcol(df_xy, f, file_col, var_col, data_band,
                                     mosaic_tx, val_cols, data_type, nodata,
                                     kernel
                                     ))
        file_count += 1'''
    args = []
    for i, f in enumerate(df_tile[file_col].unique()):
        args.append([
            df_xy, f, file_col, var_col, data_band, mosaic_tx, val_cols,
            data_type, nodata, kernel, i + 1 + file_count
        ])
    n_jobs = 10
    pool = Pool(n_jobs)
    this_count = len(args)
    print 'Extracting from %s-%s files of %s...' % (file_count, file_count +
                                                    this_count, n_files)
    dfs = pool.map(par_extract_by_rowcol, args, 1)
    pool.close()
    pool.join()
    print '\nTime for this variable: %.1f minutes\n' % (
        (time.time() - t0) / 60)
    file_count += this_count

    # Combine all the pieces for this year
    df_var = pd.concat(dfs)

    return df_var, file_count
Exemple #47
0
def apply_by_multiprocessing_list_to_list(df, func, **kwargs):
    workers = kwargs.pop('workers')
    pool = Pool(processes=workers)
    result = pool.map(apply_list, [(d, func, kwargs) for d in np.array_split(df, workers)])
    pool.close()
    return result
Exemple #48
0
def run(**kwargs):
    if len(kwargs.get("pipeline")) == 0:
        raise TLException("pipeline command must be given.")

    parallel_count = int(kwargs['parallel_count'])
    input_files = kwargs["input"]
    running_configs = []
    gpu_resources = kwargs.get("gpu_resources")

    # setup the running config
    pipeline_cleaned = kwargs['pipeline']
    for i, each in enumerate(input_files):
        each_config = {
            "input": each,
            "command": pipeline_cleaned,
            "output_folder": kwargs.get("output_folder"),
            "output_name": kwargs.get("output_name"),
            "ground_truth_pattern": kwargs.get("ground_truth_pattern"),
            "ground_truth_directory": kwargs.get("ground_truth_directory", ""),
            "score_column": kwargs.get("score_column"),
            "debug": kwargs.get("debug", False)
        }
        running_configs.append(each_config)
        if len(gpu_resources) > 0:
            each_config["gpu_id"] = gpu_resources[i % len(gpu_resources)]
        else:
            each_config["gpu_id"] = None

    # start running
    try:
        from multiprocessing import Pool
        from tqdm import tqdm
        import time
        import pandas as pd
        from io import StringIO
        from tl.utility.run_pipelines_utility import PipelineUtility
        if parallel_count == 1:
            results = []
            for each in tqdm(running_configs):
                results.append(PipelineUtility.run_one_pipeline(each))
        else:
            from multiprocessing import set_start_method
            set_start_method("spawn")

            # use multiprocess pool function to run in parallel mode
            p = Pool(parallel_count)
            result = p.map_async(PipelineUtility.run_one_pipeline,
                                 running_configs)
            pbar = tqdm(total=len(running_configs))
            previous_remain = len(running_configs)
            while not result.ready():
                remain_job = result._number_left
                if remain_job != previous_remain:
                    pbar.update(previous_remain - remain_job)
                    previous_remain = remain_job
                time.sleep(2)
            pbar.close()
            results = result.get()
            p.close()
            p.join()

        PipelineUtility.print_pipeline_running_results(
            results,
            omit_header=kwargs['omit_headers'],
            input_files=input_files,
            tag=kwargs.get('tag'))
    except:
        message = 'Command: run-pipeline\n'
        message += 'Error Message:  {}\n'.format(traceback.format_exc())
        raise TLException(message)
Exemple #49
0
class MapWrapper:
    """
    Parallelisation wrapper for working with map-like callables, such as
    `multiprocessing.Pool.map`.

    Parameters
    ----------
    pool : int or map-like callable
        If `pool` is an integer, then it specifies the number of threads to
        use for parallelization. If ``int(pool) == 1``, then no parallel
        processing is used and the map builtin is used.
        If ``pool == -1``, then the pool will utilize all available CPUs.
        If `pool` is a map-like callable that follows the same
        calling sequence as the built-in map function, then this callable is
        used for parallelization.
    """
    def __init__(self, pool=1):
        self.pool = None
        self._mapfunc = map
        self._own_pool = False

        if callable(pool):
            self.pool = pool
            self._mapfunc = self.pool
        else:
            from multiprocessing import Pool
            # user supplies a number
            if int(pool) == -1:
                # use as many processors as possible
                self.pool = Pool()
                self._mapfunc = self.pool.map
                self._own_pool = True
            elif int(pool) == 1:
                pass
            elif int(pool) > 1:
                # use the number of processors requested
                self.pool = Pool(processes=int(pool))
                self._mapfunc = self.pool.map
                self._own_pool = True
            else:
                raise RuntimeError("Number of workers specified must be -1,"
                                   " an int >= 1, or an object with a 'map' "
                                   "method")

    def __enter__(self):
        return self

    def terminate(self):
        if self._own_pool:
            self.pool.terminate()

    def join(self):
        if self._own_pool:
            self.pool.join()

    def close(self):
        if self._own_pool:
            self.pool.close()

    def __exit__(self, exc_type, exc_value, traceback):
        if self._own_pool:
            self.pool.close()
            self.pool.terminate()

    def __call__(self, func, iterable):
        # only accept one iterable because that's all Pool.map accepts
        try:
            return self._mapfunc(func, iterable)
        except TypeError as e:
            # wrong number of arguments
            raise TypeError("The map-like callable must be of the"
                            " form f(func, iterable)") from e
def create_threads():
    # Despite the name, this uses a process Pool rather than threads.
    pool = Pool()
    results = pool.map(get_rosters, get_roster_links())
    pool.close()
    pool.join()
    return results
Exemple #51
0
 def download_video_page_async_multi_process(self, process_num=10):
     pool = Pool(processes=process_num)
     for line in range(process_num):
         pool.apply_async(self.download_video_page_async_single_process)
     pool.close()
     pool.join()
Exemple #52
0
def gen_html_report(param_dct, output_df, lipid_info_img_lst):

    usr_vendor = param_dct['vendor']
    output_folder = param_dct['img_output_folder_str']
    usr_ms1_ppm = param_dct['ms_ppm']
    usr_ms2_ppm = param_dct['ms2_ppm']
    usr_ms1_precision = usr_ms1_ppm * 1e-6
    usr_ms2_precision = usr_ms2_ppm * 1e-6
    usr_core_num = param_dct['core_number']
    usr_dpi = param_dct['img_dpi']
    usr_img_type = param_dct['img_type']
    hunter_start_time_str = param_dct['hunter_start_time']

    # keep stay in current working directory
    current_path = os.getcwd()
    if os.path.isdir(output_folder):
        os.chdir(output_folder)
        if os.path.isdir('LipidHunter_Results_Figures_%s' %
                         hunter_start_time_str):
            print('[INFO] --> Output folder existed...')
        else:
            os.mkdir('LipidHunter_Results_Figures_%s' % hunter_start_time_str)
            print('[INFO] --> Output folder created...')
    else:
        os.mkdir(output_folder)
        os.chdir(output_folder)
        os.mkdir('LipidHunter_Results_Figures_%s' % hunter_start_time_str)
        print('[INFO] --> Output folder created...')
    os.chdir(current_path)

    # generate html files
    log_pager = LogPageCreator(output_folder, hunter_start_time_str, param_dct)
    log_pager.add_all_info(output_df)
    log_pager.close_page()
    # del log_pager
    print('[STATUS] >>> start to generate images: image count %i' %
          len(lipid_info_img_lst))

    if usr_core_num > 1:
        parallel_pool = Pool(usr_core_num)
        img_num = len(lipid_info_img_lst)
        img_sub_len = int(math.ceil(img_num / usr_core_num))
        img_sub_key_lst = [
            lipid_info_img_lst[k:k + img_sub_len]
            for k in range(0, img_num, img_sub_len)
        ]
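        # img_sub_key_lst now holds roughly usr_core_num equally sized batches
        # of images, one batch per worker process.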

        worker_count = 1
        for img_sub_lst in img_sub_key_lst:
            if isinstance(img_sub_lst, tuple) or isinstance(img_sub_lst, list):
                if None in img_sub_lst:
                    img_sub_lst = [x for x in img_sub_lst if x is not None]
                else:
                    pass
                # img_params_dct = {'lipid_info_img_lst': img_sub_lst, 'usr_core_num': usr_core_num,
                #                   'usr_img_type': usr_img_type, 'usr_dpi': usr_dpi, 'usr_vendor': usr_vendor,
                #                   'usr_ms1_precision': usr_ms1_precision, 'worker_count': worker_count}

                if len(img_sub_lst) > 0:
                    print(
                        '[STATUS] >>> Core #%i ==> Generating output images ... image count: %i'
                        % (worker_count, len(img_sub_lst)))
                    if 'debug_mode' in list(param_dct.keys()):
                        if param_dct['debug_mode'] == 'ON':
                            for img_param_dct in img_sub_lst:
                                print(img_param_dct['save_img_as'])
                    parallel_pool.apply_async(
                        gen_plot,
                        args=(img_sub_lst, worker_count, usr_img_type, usr_dpi,
                              usr_vendor, usr_ms1_precision))
                    worker_count += 1
        # del img_sub_key_lst
        # del img_sub_lst
        parallel_pool.close()
        parallel_pool.join()

    else:
        worker_count = 1
        print('[INFO] --> Using single core mode...')
        if isinstance(lipid_info_img_lst, tuple) or isinstance(
                lipid_info_img_lst, list):
            if None in lipid_info_img_lst:
                lipid_info_img_lst = [
                    x for x in lipid_info_img_lst if x is not None
                ]
            else:
                pass
            if len(lipid_info_img_lst) > 0:
                gen_plot(lipid_info_img_lst, worker_count, usr_img_type,
                         usr_dpi, usr_vendor, usr_ms1_precision)
Exemple #53
0
def analyse(config, prog, argv):
    parser = argparse.ArgumentParser(prog=prog,
                                     description='Analyse patch stacks')

    # thresholds
    parser.add_argument('-th',
                        dest='thres_heading',
                        metavar='threshold',
                        default=config.thresholds.heading,
                        type=float,
                        help='Minimum diff hunk section heading similarity '
                        '(default: %(default)s)')
    parser.add_argument('-tf',
                        dest='thres_filename',
                        metavar='threshold',
                        default=config.thresholds.filename,
                        type=float,
                        help='Minimum filename similarity '
                        '(default: %(default)s)')
    parser.add_argument(
        '-dlr',
        dest='thres_diff_lines',
        metavar='threshold',
        type=float,
        default=config.thresholds.diff_lines_ratio,
        help='Diff lines ratio threshold (default: %(default)s)')
    parser.add_argument('-adi',
                        dest='thres_adi',
                        metavar='days',
                        type=int,
                        default=config.thresholds.author_date_interval,
                        help='Author date interval (default: %(default)s)')

    parser.add_argument('-er',
                        dest='er_filename',
                        metavar='filename',
                        default=config.f_evaluation_result,
                        help='Evaluation result PKL filename')

    parser.add_argument('-cpu',
                        dest='cpu_factor',
                        metavar='cpu',
                        type=float,
                        default=1.0,
                        help='CPU factor for parallelisation '
                        '(default: %(default)s)')

    parser.add_argument('-linux',
                        dest='linux',
                        action='store_true',
                        default=False,
                        help='Make a Linux kernel specific analysis')

    # choose analysis mode
    parser.add_argument('mode',
                        default='succ',
                        choices=['succ', 'rep', 'upstream'],
                        help='rep: '
                        'compare representatives of the stack - '
                        'succ: '
                        'compare successive versions of the stacks - '
                        'upstream: '
                        'compare representatives against upstream - '
                        '(default: %(default)s)')

    parser.add_argument('-upstream',
                        dest='upstream_range',
                        metavar='<revision range>',
                        default=None,
                        help='Specify upstream revision range, '
                        'e.g.: v0.1..v0.2 (default: %s)' %
                        config.upstream_range)

    parser.add_argument('-differential',
                        dest='differential',
                        action='store_true',
                        default=False,
                        help='Perform a differential analysis')

    args = parser.parse_args(argv)

    config.thresholds.heading = args.thres_heading
    config.thresholds.filename = args.thres_filename
    config.thresholds.diff_lines_ratio = args.thres_diff_lines
    config.thresholds.author_date_interval = args.thres_adi

    repo = config.repo
    mbox = config.mode == Config.Mode.MBOX
    mode = args.mode

    if mbox and mode == 'succ':
        log.error('Analysis mode succ is not available in mailbox mode!')
        return -1

    if not mbox and args.differential:
        log.error(
            'Differential analysis can only be performed in mailbox mode')

    f_cluster, cluster = config.load_cluster(must_exist=False)

    def fill_result(hashes, tag):
        for hash in hashes:
            cluster.insert_element(hash)
            if tag:
                cluster.mark_upstream(hash, True)

        # intermediate persistence
        cluster.to_file(f_cluster)

    if mbox:
        log.info('Regarding mails in time window %s--%s' % (format_date_ymd(
            config.mbox_mindate), format_date_ymd(config.mbox_maxdate)))
        # load mbox ccache very early, because we need it in any case if it
        # exists.
        config.load_ccache_mbox()

        new_patches = set()
        if mode == 'rep':
            victims = repo.mbox.get_ids(config.mbox_time_window)

            # we have to temporarily cache those commits to filter out invalid
            # emails. Commit cache is already loaded, so evict everything except
            # victims and then cache all victims.
            repo.cache_evict_except(victims)
            repo.cache_commits(victims)

            # we might have loaded invalid emails, so reload the victim list once
            # more. This time, include all patches from the pre-existing (partial)
            # result, and check if all patches are reachable
            victims = repo.mbox.get_ids(config.mbox_time_window) | \
                    cluster.get_downstream()

            # in case of an mbox analysis, we will definitely need all untagged
            # commit hashes as we need to determine the representative system for
            # both modes, rep and upstream.
            available = repo.cache_commits(victims)
            if available != victims:
                missing = victims - available
                log.warning('MAILBOX RESULT CONTAINS %d MESSAGES THAT ARE NOT '
                            'REACHABLE BY THE MAILBOX CONFIGURATION' %
                            len(missing))
                log.warning('Those messages will be removed from the result')
                log.warning(
                    'Waiting 5 seconds before starting. Press Ctrl-C to '
                    'abort.')
                sleep(5)
                for miss in missing:
                    cluster.remove_element(miss)
                cluster.optimize()
                victims = available

            if args.linux:
                if config.mbox_use_patchwork_id:
                    log.error('Doesn\'t work with USE_PATCHWORK_ID = true')
                    return -1

                log.info('Searching for non-Linux patches...')
                repo.mbox.load_threads()
                characteristic = load_linux_mail_characteristics(repo, victims)
                linux_patches = {
                    victim
                    for victim in victims
                    if characteristic[victim].patches_linux
                }
                log.info(
                    'Will consider only %u/%u patches (%0.3f%%) as Linux '
                    'patches' % (len(linux_patches), len(victims),
                                 len(linux_patches) * 100.0 / len(victims)))
                victims = linux_patches
                repo.cache_evict_except(victims)

            # get new downstream patches since previous analysis
            new_patches = victims - cluster.get_downstream()
            log.info('Cached %d relevant mails' % len(available))
            fill_result(victims, False)

    cherries = EvaluationResult()

    if mode == 'succ':
        victims = config.psd.commits_on_stacks
        fill_result(victims, False)
        num_cpus = int(cpu_count() * args.cpu_factor)

        psd = config.psd
        global _repo
        repo = config.repo
        _repo = repo

        config.load_ccache_stack()

        evaluation_list = []
        for patch_stack in psd:
            successor = psd.get_successor(patch_stack)
            if successor is None:
                break

            log.info('Queueing %s <-> %s' %
                     (patch_stack.stack_version, successor.stack_version))
            evaluation_list.append(
                (patch_stack.commit_hashes, successor.commit_hashes))

        # cache missing commits
        repo.cache_commits(psd.commits_on_stacks)

        cherries = find_cherries(repo, psd.commits_on_stacks,
                                 psd.commits_on_stacks)

        f = partial(_evaluate_patch_list_wrapper, config.thresholds)
        log.info('Starting evaluation.')
        pool = Pool(num_cpus, maxtasksperchild=1)
        results = pool.map(f, evaluation_list, chunksize=5)
        pool.close()
        pool.join()
        log.info('  ↪ done.')
        _repo = None

        evaluation_result = EvaluationResult(False, EvaluationType.PatchStack)

        for result in results:
            evaluation_result.merge(result)

    else:  # mode is rep or upstream
        # iterate over similar patch list and get latest commit of patches
        log.info('Determining patch stack representative system')

        # Get the complete representative system
        # The lambda compares two patches of an equivalence class and chooses
        # the one with the later release version
        if mbox:
            representatives = cluster.get_representative_system(
                lambda x, y: repo.get_commit(x).author.date > repo.get_commit(
                    y).author.date)
        else:
            representatives = cluster.get_representative_system(
                lambda x, y: config.psd.is_stack_version_greater(
                    config.psd.get_stack_of_commit(x),
                    config.psd.get_stack_of_commit(y)))
        log.info('  ↪ done')

        if mode == 'upstream':
            if args.upstream_range is not None:
                candidates = set(repo.get_commithash_range(
                    args.upstream_range))
            else:
                candidates = set(config.upstream_hashes)

            # get new upstream patches since last analysis
            new_patches |= candidates - cluster.get_upstream()
            fill_result(candidates, True)

            config.load_ccache_upstream()

            # cache missing commits
            repo.cache_commits(representatives | candidates)
            repo.cache_evict_except(representatives | candidates)

            cherries = find_cherries(repo, representatives, candidates)
            type = EvaluationType.Upstream
        elif mode == 'rep':
            repo.cache_commits(representatives)
            candidates = representatives

            if not mbox:
                cherries = find_cherries(repo, representatives,
                                         config.psd.commits_on_stacks)

            type = EvaluationType.PatchStack

        if args.differential:
            representatives = representatives | new_patches
            log.info('Starting differential evaluation of %u new patches' %
                     len(new_patches))
            differential_evaluation = evaluate_commit_list(
                repo,
                config.thresholds,
                mbox,
                type,
                representatives,
                new_patches,
                parallelise=True,
                verbose=True,
                cpu_factor=args.cpu_factor)
            evaluation_result = EvaluationResult.from_file(
                config.f_evaluation_result, config.d_false_positives)
            if evaluation_result:
                evaluation_result.merge(differential_evaluation)
            else:
                evaluation_result = differential_evaluation
        else:
            log.info('Starting evaluation')
            evaluation_result = evaluate_commit_list(
                repo,
                config.thresholds,
                mbox,
                type,
                representatives,
                candidates,
                parallelise=True,
                verbose=True,
                cpu_factor=args.cpu_factor)
        log.info('  ↪ done.')

    evaluation_result.merge(cherries)
    evaluation_result.to_file(args.er_filename)
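
The succ-mode evaluation above parallelises the stack-to-successor comparisons with functools.partial and a Pool whose workers are recycled after every task (maxtasksperchild=1), which bounds memory growth in long-lived workers. A minimal, self-contained sketch of that pattern, with a hypothetical evaluate_pair() standing in for the real comparison routine:

from functools import partial
from multiprocessing import Pool, cpu_count


def evaluate_pair(thresholds, pair):
    # Hypothetical stand-in: compare two commit lists against a threshold.
    left, right = pair
    return len(set(left) & set(right)) >= thresholds


def run_evaluation(evaluation_list, thresholds, cpu_factor=1.0):
    f = partial(evaluate_pair, thresholds)
    # maxtasksperchild=1 recycles each worker after one task.
    pool = Pool(max(1, int(cpu_count() * cpu_factor)), maxtasksperchild=1)
    results = pool.map(f, evaluation_list, chunksize=5)
    pool.close()
    pool.join()
    return results


if __name__ == '__main__':
    pairs = [(['a', 'b'], ['b', 'c']), (['x'], ['y'])]
    print(run_evaluation(pairs, thresholds=1))
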
def update_book_info(isbnfile, shelf_file, info_file):
    '''Update half.com prices based on ISBN.'''
    global result_file  # result file
    global onshelf_file  # on-shelf listing result file
    global success_isbn_file
    global offshelf_isbn_file
    global not_crawl_file
    global not_list_file
    # global bu_onshelf_file  # records whose condition fails the requirements
    # global bu_half_file  # ----
    global delete_file  # records ISBNs with unacceptable weight, and books whose purchase price is below 1
    # global all_fail_file
    global success_sku_file
    global lock
    lock = Lock()

    result_file = open(info_file, "w")
    onshelf_file = open(shelf_file, "w")
    # bu_onshelf_file = open('./update/info/onshelf_condition_fail.csv', "w")
    # bu_half_file = open('./update/info/half_condition_fail.csv', "w")
    success_isbn_file = open("./update/info/success_isbn.txt", "w")
    offshelf_isbn_file = open("./update/info/offshelf_isbn.txt", "w")
    not_crawl_file = open("./update/info/not_crawl.txt", "w")
    not_list_file = open("./update/info/not_found.txt", "w")
    delete_file = open("./update/info/delete_isbn.txt", "w")
    success_sku_file = open("./update/isbn/success_sku.txt", 'w')

    # all_fail_file = open("./update/isbn/fail_file.txt",'w')

    titles = [
        'ISBN', 'ISBN13', 'weight', 'auther', 'condition', 'price', 'sec_price'
    ]
    shelf_titles = [
        'sku', 'price', 'minimum-seller-allowed-price', 'auther',
        'maximum-seller-allowed-price', 'quantity', 'leadtime-to-ship'
    ]
    create_titles(info_file, titles)
    create_titles(shelf_file, shelf_titles)
    # read the ISBN list
    #     path="./update/isbn"
    #     isbns=get_isbns(path)
    isbn_file = open(isbnfile)
    isbns = isbn_file.readlines()
    #isbns = isbns[:100]
    #     #print isbns

    pool = Pool(20)
    pool.map(get_book_info, isbns)
    pool.close()
    pool.join()

    result_file.close()
    onshelf_file.close()
    success_isbn_file.close()
    success_sku_file.close()
    offshelf_isbn_file.close()
    not_list_file.close()
    # bu_onshelf_file.close()
    # bu_half_file.close()
    not_crawl_file.close()
    delete_file.close()
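
update_book_info shares its open output files and a Lock with the pool workers through module-level globals, which relies on fork semantics. A hedged sketch of the same idea that hands the lock to each worker through a Pool initializer, so it also works under spawn; the worker body and file names are illustrative:

from multiprocessing import Pool, Lock


def init_worker(shared_lock):
    # Store the lock in a module-level global inside each worker process.
    global lock
    lock = shared_lock


def get_book_info(isbn):
    isbn = isbn.strip()
    # ... fetch price data for this ISBN ...
    with lock:  # serialize writes from all workers
        with open('./update/info/success_isbn.txt', 'a') as f:  # illustrative path
            f.write(isbn + '\n')


if __name__ == '__main__':
    shared_lock = Lock()
    with open('./update/isbn/isbn_list.txt') as f:  # illustrative path
        isbns = f.readlines()
    pool = Pool(20, initializer=init_worker, initargs=(shared_lock,))
    pool.map(get_book_info, isbns)
    pool.close()
    pool.join()
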
Exemple #55
0
def initialize_topics():
    """重新初始化topic
       1. 把话题status变为initializing
       2. 设置话题起始时间
       3. 清除新闻表中每条新闻的subeventid字段、weight字段、duplicate字段、same_from字段
       4. 清除子事件表中话题的相关数据
       5. 清除子事件特征词表中话题的相关数据
    """
    topic_start = []
    """
    topicname = u'APEC2014'
    start_datetime = "2014-11-04 00:00:00"
    topic_start.append((topicname, start_datetime))

    topicname = u'手术台自拍'
    start_datetime = "2014-12-23 00:00:00"
    topic_start.append((topicname, start_datetime))

    topicname = u'呼格案'
    start_datetime = "2014-12-14 00:00:00"
    topic_start.append((topicname, start_datetime))

    topicname = u'复旦投毒案'
    start_datetime = "2014-02-18 00:00:00"
    topic_start.append((topicname, start_datetime))

    topicname = u'非法占中'
    start_datetime = "2014-09-30 00:00:00"
    topic_start.append((topicname, start_datetime))

    topicname = u'马航失联'
    start_datetime = "2014-03-10 00:00:00"
    topic_start.append((topicname, start_datetime))

    topicname = u'博鳌论坛'
    start_datetime = "2014-04-03 00:00:00"
    topic_start.append((topicname, start_datetime))

    topicname = u'昆明火车站暴恐案'
    start_datetime = "2014-03-03 00:00:00"
    topic_start.append((topicname, start_datetime))

    topicname = u'乌鲁木齐火车站暴恐'
    start_datetime = "2014-05-01 00:00:00"
    topic_start.append((topicname, start_datetime))

    topicname = u'全军政治工作会议'
    start_datetime = "2014-11-04 00:00:00"
    topic_start.append((topicname, start_datetime))
    """

    topicname = u'高校思想宣传工作'
    start_datetime = "2015-01-31 00:00:00"
    """
    topicname = u'张灵甫遗骨被埋羊圈'
    start_datetime = "2015-01-31 00:00:00"
    """

    topic_start.append((topicname, start_datetime))

    pool = Pool()
    pool.map(one_topic_clear, topic_start)
    pool.close()
    pool.join()
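
Pool.map above passes each (topicname, start_datetime) tuple to one_topic_clear as a single argument, so the worker must unpack it itself; on Python 3, Pool.starmap does the unpacking. A small sketch with a hypothetical clear_topic worker:

from multiprocessing import Pool


def clear_topic(topicname, start_datetime):
    # Hypothetical worker: reset one topic starting from the given datetime.
    print('clearing %s since %s' % (topicname, start_datetime))


if __name__ == '__main__':
    topic_start = [
        (u'高校思想宣传工作', '2015-01-31 00:00:00'),
    ]
    pool = Pool()
    pool.starmap(clear_topic, topic_start)
    pool.close()
    pool.join()
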
Exemple #56
0
class MultiProcPlugin(DistributedPluginBase):
    """Execute workflow with multiprocessing, not sending more jobs at once
    than the system can support.

    The plugin_args input to run can be used to control the multiprocessing
    execution and defining the maximum amount of memory and threads that
    should be used. When those parameters are not specified,
    the number of threads and memory of the system is used.

    System consuming nodes should be tagged:
    memory_consuming_node.interface.estimated_memory_gb = 8
    thread_consuming_node.interface.num_threads = 16

    The default number of threads and memory for a node is 1.

    Currently supported options are:

    - non_daemon : boolean flag to execute as non-daemon processes
    - n_procs: maximum number of threads to be executed in parallel
    - memory_gb: maximum memory (in GB) that can be used at once.

    """
    def __init__(self, plugin_args=None):
        # Init variables and instance attributes
        super(MultiProcPlugin, self).__init__(plugin_args=plugin_args)
        self._taskresult = {}
        self._task_obj = {}
        self._taskid = 0
        non_daemon = True
        self.plugin_args = plugin_args
        self.processors = cpu_count()
        self.memory_gb = get_system_total_memory_gb() * 0.9  # 90% of system memory

        self._timeout = 2.0
        self._event = threading.Event()

        # Check plugin args
        if self.plugin_args:
            if 'non_daemon' in self.plugin_args:
                non_daemon = plugin_args['non_daemon']
            if 'n_procs' in self.plugin_args:
                self.processors = self.plugin_args['n_procs']
            if 'memory_gb' in self.plugin_args:
                self.memory_gb = self.plugin_args['memory_gb']

        logger.debug("MultiProcPlugin starting %d threads in pool" %
                     (self.processors))

        # Instantiate different thread pools for non-daemon processes
        if non_daemon:
            # run the execution using the non-daemon pool subclass
            self.pool = NonDaemonPool(processes=self.processors)
        else:
            self.pool = Pool(processes=self.processors)

    def _wait(self):
        if len(self.pending_tasks) > 0:
            if self._config['execution']['poll_sleep_duration']:
                self._timeout = float(
                    self._config['execution']['poll_sleep_duration'])
            sig_received = self._event.wait(self._timeout)
            if not sig_received:
                logger.debug(
                    'MultiProcPlugin timeout before signal received. Deadlock averted??'
                )
            self._event.clear()

    def _async_callback(self, args):
        self._taskresult[args['taskid']] = args
        self._event.set()

    def _get_result(self, taskid):
        if taskid not in self._taskresult:
            result = None
        else:
            result = self._taskresult[taskid]
        return result

    def _report_crash(self, node, result=None):
        if result and result['traceback']:
            node._result = result['result']
            node._traceback = result['traceback']
            return report_crash(node, traceback=result['traceback'])
        else:
            return report_crash(node)

    def _clear_task(self, taskid):
        del self._task_obj[taskid]

    def _submit_job(self, node, updatehash=False):
        self._taskid += 1
        if hasattr(node.inputs, 'terminal_output'):
            if node.inputs.terminal_output == 'stream':
                node.inputs.terminal_output = 'allatonce'

        self._task_obj[self._taskid] = \
            self.pool.apply_async(run_node,
                                  (node, updatehash, self._taskid),
                                  callback=self._async_callback)
        return self._taskid

    def _close(self):
        self.pool.close()
        return True

    def _send_procs_to_workers(self, updatehash=False, graph=None):
        """ Sends jobs to workers when system resources are available.
            Check memory (gb) and cores usage before running jobs.
        """
        executing_now = []

        # Check to see if a job is available
        currently_running_jobids = np.flatnonzero((self.proc_pending == True) & \
                                (self.depidx.sum(axis=0) == 0).__array__())

        # Check available system resources by summing all threads and memory used
        busy_memory_gb = 0
        busy_processors = 0
        for jobid in currently_running_jobids:
            if self.procs[jobid]._interface.estimated_memory_gb <= self.memory_gb and \
                            self.procs[jobid]._interface.num_threads <= self.processors:

                busy_memory_gb += self.procs[
                    jobid]._interface.estimated_memory_gb
                busy_processors += self.procs[jobid]._interface.num_threads

            else:
                raise ValueError(
                    "Resources required by jobid %d (%f GB, %d threads) "
                    "exceed what is available on the system (%f GB, %d threads)"
                    %
                    (jobid, self.procs[jobid]._interface.estimated_memory_gb,
                     self.procs[jobid]._interface.num_threads, self.memory_gb,
                     self.processors))

        free_memory_gb = self.memory_gb - busy_memory_gb
        free_processors = self.processors - busy_processors

        # Check all jobs without dependency not run
        jobids = np.flatnonzero((self.proc_done == False) & \
                                (self.depidx.sum(axis=0) == 0).__array__())

        # Sort jobs ready to run first by memory and then by number of threads
        # The most resource consuming jobs run first
        jobids = sorted(jobids,
                        key=lambda item:
                        (self.procs[item]._interface.estimated_memory_gb, self.
                         procs[item]._interface.num_threads))

        if str2bool(config.get('execution', 'profile_runtime')):
            logger.debug('Free memory (GB): %d, Free processors: %d',
                         free_memory_gb, free_processors)

        # While have enough memory and processors for first job
        # Submit first job on the list
        for jobid in jobids:
            if str2bool(config.get('execution', 'profile_runtime')):
                logger.debug('Next Job: %d, memory (GB): %d, threads: %d' \
                             % (jobid,
                                self.procs[jobid]._interface.estimated_memory_gb,
                                self.procs[jobid]._interface.num_threads))

            if self.procs[jobid]._interface.estimated_memory_gb <= free_memory_gb and \
               self.procs[jobid]._interface.num_threads <= free_processors:
                logger.info('Executing: %s ID: %d' %
                            (self.procs[jobid]._id, jobid))
                executing_now.append(self.procs[jobid])

                if isinstance(self.procs[jobid], MapNode):
                    try:
                        num_subnodes = self.procs[jobid].num_subnodes()
                    except Exception:
                        etype, eval, etr = sys.exc_info()
                        traceback = format_exception(etype, eval, etr)
                        report_crash(self.procs[jobid], traceback=traceback)
                        self._clean_queue(jobid, graph)
                        self.proc_pending[jobid] = False
                        continue
                    if num_subnodes > 1:
                        submit = self._submit_mapnode(jobid)
                        if not submit:
                            continue

                # change job status in appropriate queues
                self.proc_done[jobid] = True
                self.proc_pending[jobid] = True

                free_memory_gb -= self.procs[
                    jobid]._interface.estimated_memory_gb
                free_processors -= self.procs[jobid]._interface.num_threads

                # Send job to task manager and add to pending tasks
                if self._status_callback:
                    self._status_callback(self.procs[jobid], 'start')
                if str2bool(self.procs[jobid].config['execution']
                            ['local_hash_check']):
                    logger.debug('checking hash locally')
                    try:
                        hash_exists, _, _, _ = self.procs[jobid].hash_exists()
                        logger.debug('Hash exists %s' % str(hash_exists))
                        if (hash_exists and
                            (self.procs[jobid].overwrite is False or
                             (self.procs[jobid].overwrite is None and
                              not self.procs[jobid]._interface.always_run))):
                            self._task_finished_cb(jobid)
                            self._remove_node_dirs()
                            continue
                    except Exception:
                        etype, eval, etr = sys.exc_info()
                        traceback = format_exception(etype, eval, etr)
                        report_crash(self.procs[jobid], traceback=traceback)
                        self._clean_queue(jobid, graph)
                        self.proc_pending[jobid] = False
                        continue
                logger.debug('Finished checking hash')

                if self.procs[jobid].run_without_submitting:
                    logger.debug('Running node %s on master thread' \
                                 % self.procs[jobid])
                    try:
                        self.procs[jobid].run()
                    except Exception:
                        etype, eval, etr = sys.exc_info()
                        traceback = format_exception(etype, eval, etr)
                        report_crash(self.procs[jobid], traceback=traceback)
                    self._task_finished_cb(jobid)
                    self._remove_node_dirs()

                else:
                    logger.debug('MultiProcPlugin submitting %s' % str(jobid))
                    tid = self._submit_job(deepcopy(self.procs[jobid]),
                                           updatehash=updatehash)
                    if tid is None:
                        self.proc_done[jobid] = False
                        self.proc_pending[jobid] = False
                    else:
                        self.pending_tasks.insert(0, (tid, jobid))
            else:
                break
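
The non_daemon branch above hands work to a NonDaemonPool so that pool workers are allowed to spawn their own child processes; that class is defined elsewhere in nipype. A commonly used sketch of such a pool for Python 3.8+, with illustrative class names rather than nipype's exact implementation:

import multiprocessing
import multiprocessing.pool


class NoDaemonProcess(multiprocessing.Process):
    # Report the worker as non-daemonic so it may spawn children.
    @property
    def daemon(self):
        return False

    @daemon.setter
    def daemon(self, value):
        pass


class NoDaemonContext(type(multiprocessing.get_context())):
    Process = NoDaemonProcess


class NonDaemonPool(multiprocessing.pool.Pool):
    # Hand the pool a context whose Process class is non-daemonic.
    def __init__(self, *args, **kwargs):
        kwargs['context'] = NoDaemonContext()
        super().__init__(*args, **kwargs)

Subclassing multiprocessing.pool.Pool rather than multiprocessing.Pool is deliberate: the latter is only a factory function, not a class.
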
Exemple #57
0
class CeleryExecutor(BaseExecutor):
    """
    CeleryExecutor is recommended for production use of Airflow. It allows
    distributing the execution of task instances to multiple worker nodes.

    Celery is a simple, flexible and reliable distributed system to process
    vast amounts of messages, while providing operations with the tools
    required to maintain such a system.
    """
    def __init__(self):
        super().__init__()

        # Celery doesn't support querying the state of multiple tasks in parallel
        # (which can become a bottleneck on bigger clusters) so we use
        # a multiprocessing pool to speed this up.
        # How many worker processes are created for checking celery task state.
        self._sync_parallelism = conf.getint('celery', 'SYNC_PARALLELISM')
        if self._sync_parallelism == 0:
            self._sync_parallelism = max(1, cpu_count() - 1)

        self._sync_pool = None
        self.tasks = {}
        self.last_state = {}

    def start(self) -> None:
        self.log.debug(
            'Starting Celery Executor using %s processes for syncing',
            self._sync_parallelism)

    def _num_tasks_per_send_process(self, to_send_count: int) -> int:
        """
        How many Celery tasks should each worker process send.

        :return: Number of tasks that should be sent per process
        :rtype: int
        """
        return max(
            1, int(math.ceil(1.0 * to_send_count / self._sync_parallelism)))

    def _num_tasks_per_fetch_process(self) -> int:
        """
        How many Celery task states should be fetched by each worker process.

        :return: Number of tasks that should be used per process
        :rtype: int
        """
        return max(
            1, int(math.ceil(1.0 * len(self.tasks) / self._sync_parallelism)))

    def trigger_tasks(self, open_slots: int) -> None:
        """
        Overwrite trigger_tasks function from BaseExecutor

        :param open_slots: Number of open slots
        :return:
        """
        sorted_queue = self.order_queued_tasks_by_priority()

        task_tuples_to_send: List[TaskInstanceInCelery] = []

        for _ in range(min(open_slots, len(self.queued_tasks))):
            key, (command, _, queue, simple_ti) = sorted_queue.pop(0)
            task_tuples_to_send.append(
                (key, simple_ti, command, queue, execute_command))

        cached_celery_backend = None
        if task_tuples_to_send:
            tasks = [t[4] for t in task_tuples_to_send]

            # Celery state queries will get stuck if we do not use the same
            # backend for all tasks.
            cached_celery_backend = tasks[0].backend

        if task_tuples_to_send:
            # Use chunks instead of a work queue to reduce context switching
            # since tasks are roughly uniform in size
            chunksize = self._num_tasks_per_send_process(
                len(task_tuples_to_send))
            num_processes = min(len(task_tuples_to_send),
                                self._sync_parallelism)

            send_pool = Pool(processes=num_processes)
            key_and_async_results = send_pool.map(send_task_to_executor,
                                                  task_tuples_to_send,
                                                  chunksize=chunksize)

            send_pool.close()
            send_pool.join()
            self.log.debug('Sent all tasks.')

            for key, command, result in key_and_async_results:
                if isinstance(result, ExceptionWithTraceback):
                    self.log.error(  # pylint: disable=logging-not-lazy
                        CELERY_SEND_ERR_MSG_HEADER + ":%s\n%s\n",
                        result.exception, result.traceback)
                elif result is not None:
                    # Only pops when enqueued successfully, otherwise keep it
                    # and expect scheduler loop to deal with it.
                    self.queued_tasks.pop(key)
                    result.backend = cached_celery_backend
                    self.running.add(key)
                    self.tasks[key] = result
                    self.last_state[key] = celery_states.PENDING

    def sync(self) -> None:
        num_processes = min(len(self.tasks), self._sync_parallelism)
        if num_processes == 0:
            self.log.debug("No task to query celery, skipping sync")
            return

        self.log.debug("Inquiring about %s celery task(s) using %s processes",
                       len(self.tasks), num_processes)

        # Recreate the process pool each sync in case processes in the pool die
        self._sync_pool = Pool(processes=num_processes)

        # Use chunks instead of a work queue to reduce context switching since tasks are
        # roughly uniform in size
        chunksize = self._num_tasks_per_fetch_process()

        self.log.debug("Waiting for inquiries to complete...")
        task_keys_to_states = self._sync_pool.map(fetch_celery_task_state,
                                                  self.tasks.items(),
                                                  chunksize=chunksize)
        self._sync_pool.close()
        self._sync_pool.join()
        self.log.debug("Inquiries completed.")

        self.update_task_states(task_keys_to_states)

    def update_task_states(
        self, task_keys_to_states: List[Union[TaskInstanceStateType,
                                              ExceptionWithTraceback]]
    ) -> None:
        """Updates states of the tasks."""
        for key_and_state in task_keys_to_states:
            if isinstance(key_and_state, ExceptionWithTraceback):
                self.log.error(  # pylint: disable=logging-not-lazy
                    CELERY_FETCH_ERR_MSG_HEADER + ", ignoring it:%s\n%s\n",
                    repr(key_and_state.exception), key_and_state.traceback)
                continue
            key, state = key_and_state
            self.update_task_state(key, state)

    def update_task_state(self, key: TaskInstanceKeyType, state: str) -> None:
        """Updates state of a single task."""
        # noinspection PyBroadException
        try:
            if self.last_state[key] != state:
                if state == celery_states.SUCCESS:
                    self.success(key)
                    del self.tasks[key]
                    del self.last_state[key]
                elif state == celery_states.FAILURE:
                    self.fail(key)
                    del self.tasks[key]
                    del self.last_state[key]
                elif state == celery_states.REVOKED:
                    self.fail(key)
                    del self.tasks[key]
                    del self.last_state[key]
                else:
                    self.log.info("Unexpected state: %s", state)
                    self.last_state[key] = state
        except Exception:  # pylint: disable=broad-except
            self.log.exception(
                "Error syncing the Celery executor, ignoring it.")

    def end(self, synchronous: bool = False) -> None:
        if synchronous:
            while any([
                    task.state not in celery_states.READY_STATES
                    for task in self.tasks.values()
            ]):
                time.sleep(5)
        self.sync()

    def execute_async(self,
                      key: TaskInstanceKeyType,
                      command: CommandType,
                      queue: Optional[str] = None,
                      executor_config: Optional[Any] = None):
        """Do not allow async execution for Celery executor."""
        raise AirflowException("No Async execution for Celery executor.")

    def terminate(self):
        pass
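
trigger_tasks and sync above avoid a shared work queue by computing an explicit chunksize and handing contiguous chunks to a short-lived Pool. A stripped-down sketch of that chunked-map pattern outside Airflow, where fetch_state is a hypothetical stand-in for fetch_celery_task_state:

import math
from multiprocessing import Pool, cpu_count


def fetch_state(item):
    # Hypothetical stand-in: query the backend for one task's state.
    key, _async_result = item
    return key, 'SUCCESS'


def sync_states(tasks, parallelism=None):
    parallelism = parallelism or max(1, cpu_count() - 1)
    num_processes = min(len(tasks), parallelism)
    if num_processes == 0:
        return []
    # Chunks reduce context switching because the items are roughly uniform in size.
    items = list(tasks.items())
    chunksize = max(1, int(math.ceil(len(items) / float(parallelism))))
    pool = Pool(processes=num_processes)
    try:
        return pool.map(fetch_state, items, chunksize=chunksize)
    finally:
        pool.close()
        pool.join()

Recreating the pool on every sync, as the executor does, also guards against workers that died in a previous round.
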
def produce_bam_custom(kmers_trie, name, label, guides_filename, args,
                       offdist, maxoffcount, processes, n, parts):
    """Produce BAM file with guideRNA database.

    Run after all files and trie were generated
    by kmers.extract_process_kmers() and guides.analyze_guides()

    Produce files:
    sorted BAM file with off-target info:
        <name>/<name>_guides_<label>.bam
    index for the BAM file with off-target info:
        <name>/<name>_guides_<label>.bam.bai

    Args:
    kmers_trie: trie.trie object with all guideRNAs as produced by
                guides.analyze_guides()
    name: project name, used to get project args and in all output
    label: str, add it to file name of output database for this run
    guides_filename: name of file with all k-mers that are considered good
                     candidate guideRNAs, one per line;
                     if file name ends with .gz assume file is gzipped;
    args: arguments of the project, used to print some info in SAM header
    offdist: maximum Hamming distance to consider from guideRNA to
             its off-target;
             use -1 for omitting any off-target info in resulting BAM
             (works much faster);
             running time increases somewhat exponentially
             as this value increases; offdist=4 may be infeasible
             when running genome-wide analysis on mammalian genome
    maxoffcount: store at most this many off-targets for a guideRNA;
                 ignore if offdist is -1
    processes: int, how many processes to use in parallel; do not specify more
             than available in the system
    """
    guidesfiles = []
    # parts = 256
    tempdir = '%s%s' % (name,'/classifiedfiles/tempfiles')

    util.print_log('produce SAM files...')
    samfiles = ['%s/%s.sam' % (tempdir, i) for i in range(parts)]
    # samfiles = [tempfile.NamedTemporaryFile(dir=name, suffix='.sam%s' % i)
    #             for i in xrange(parts)]
    # util.print_log('store SAM in these files (gzipped): %s'
    #                % (', '.join([basename(f.name) for f in samfiles])))

        
    if isinstance(guides_filename, str):

        util.print_log('split %s into %s parts...' % (guides_filename, parts))
        guidesfiles = [tempfile.NamedTemporaryFile(dir=name,
                                                   suffix='.guides%s' % i)
                       for i in range(parts)]
        util.print_log('store guides in these files: %s'
                       % (', '.join([basename(f.name) for f in guidesfiles])))
        guidesfile = gzip.open(guides_filename) \
                     if guides_filename.endswith('.gz') \
                     else open(guides_filename)
        index_num = 0
        guidecount = 0
        for line in guidesfile:
            kmer1 = line.split()[0][0:n]
            index_num = guides.get_num(kmer1, n)
            guidesfiles[index_num].write(line)
            
            guidecount += 1
            
        guidesfile.close()
        for f in guidesfiles:
            f.flush()
        util.print_log('%s guideRNAs to process' % guidecount)
        util.print_log('done')

        process_list = []
        all_task = Queue()
        for i in range(parts):
            task = (guidesfiles[i].name, samfiles[i], i)
            all_task.put(task)

        for i in range(processes):
            p = Process(target=process_pool, args=(all_task, kmers_trie, args, offdist, maxoffcount, i, n, parts))
            p.start()
            process_list.append(p)

        for p in process_list:
            p.join()
                
        for i in range(parts):
            guidesfiles[i].close()

    else:       
        process_list = []
        all_task = Queue()
        for i in range(parts):
            task = (guides_filename[i], samfiles[i], i)
            all_task.put(task)

        for i in range(processes):
            p = Process(target=process_pool, args=(all_task, kmers_trie, args, offdist, maxoffcount, i, n, parts))
            p.start()
            process_list.append(p)

        for p in process_list:
            p.join()

    util.print_log('produce sorted BAM files...')
    
    bamfiles = ['%s/%s.bam' % (tempdir, i) for i in range(parts)]
    # bamfiles = [tempfile.NamedTemporaryFile(dir=name, suffix='.bam%s' % i)
    #             for i in xrange(parts)]
    # util.print_log('store BAM in these files: %s'
    #                % (', '.join([basename(f.name) for f in bamfiles])))

    pool = Pool(processes)
    util.print_log('poolSize %s...' % processes)
    index=False
    for i in range(parts):
        pool.apply_async(sam_to_bam,(samfiles[i], bamfiles[i], index,))
    util.print_log('Waiting for all subprocesses done...')
    pool.close()
    pool.join()

    # for i in xrange(parts):
    #     samfiles[i].close()
    util.print_log('merge into one BAM file...')
    bamfile = '%s/%s_guides%s.bam' % (name, name,
                                      '_%s' % label if label else '')
    util.print_log('store in %s' % bamfile)
    util.warn_file_exists(bamfile)
    if parts > 1000:
        mid = parts // 2
        bamfiles_temp = [tempfile.NamedTemporaryFile(dir=name, suffix='.bam%s' % i)
                        for i in range(2)]
        samtools_command1 = 'samtools merge -f %s %s' \
                           % (bamfiles_temp[0].name, ' '.join(bamfiles[0:mid]))
        os.system(samtools_command1)

        samtools_command2 = 'samtools merge -f %s %s' \
                           % (bamfiles_temp[1].name, ' '.join(bamfiles[mid:parts]))
        os.system(samtools_command2)

        samtools_command = 'samtools merge -f %s %s' \
                           % (bamfile, ' '.join([f.name for f in bamfiles_temp]))
        os.system(samtools_command)

        for f in bamfiles_temp:
            f.close()

    else:
        samtools_command = 'samtools merge -f %s %s' \
                           % (bamfile, ' '.join(bamfiles))
        # print samtools_command
        os.system(samtools_command)
    samtools_index_command = 'samtools index %s' % bamfile
    # print samtools_index_command
    os.system(samtools_index_command)
    util.print_log('done')
    # for i in xrange(parts):
    #     bamfiles[i].close()

    for i in range(parts):
        if(os.path.exists(samfiles[i])):
            os.remove(samfiles[i])
        if(os.path.exists(bamfiles[i])):
            os.remove(bamfiles[i])

    util.print_log('samtools version')
    samtools_version_command = 'samtools --version'
    print(samtools_version_command)
    os.system(samtools_version_command)
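
produce_bam_custom distributes per-part SAM generation by filling a multiprocessing Queue and letting a fixed number of Process workers drain it. A minimal sketch of that queue-draining worker pool, using one sentinel per worker instead of get_nowait (the worker body is illustrative):

from multiprocessing import Process, Queue


def worker(task_queue, worker_id):
    # Consume tasks until the sentinel (None) is received.
    while True:
        task = task_queue.get()
        if task is None:
            break
        guides_path, sam_path, part = task
        print('worker %d processing part %d -> %s' % (worker_id, part, sam_path))


if __name__ == '__main__':
    parts, processes = 8, 4
    all_task = Queue()
    for i in range(parts):
        all_task.put(('guides%d.txt' % i, 'part%d.sam' % i, i))
    for _ in range(processes):
        all_task.put(None)  # one sentinel per worker

    workers = [Process(target=worker, args=(all_task, i)) for i in range(processes)]
    for p in workers:
        p.start()
    for p in workers:
        p.join()
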
Exemple #59
0
def eval_map(det_results,
             annotations,
             scale_ranges=None,
             iou_thr=0.5,
             dataset=None,
             logger=None,
             tpfp_fn=None,
             nproc=4):
    """Evaluate mAP of a dataset.

    Args:
        det_results (list[list]): [[cls1_det, cls2_det, ...], ...].
            The outer list indicates images, and the inner list indicates
            per-class detected bboxes.
        annotations (list[dict]): Ground truth annotations where each item of
            the list indicates an image. Keys of annotations are:

            - `bboxes`: numpy array of shape (n, 4)
            - `labels`: numpy array of shape (n, )
            - `bboxes_ignore` (optional): numpy array of shape (k, 4)
            - `labels_ignore` (optional): numpy array of shape (k, )
        scale_ranges (list[tuple] | None): Range of scales to be evaluated,
            in the format [(min1, max1), (min2, max2), ...]. A range of
            (32, 64) means the area range between (32**2, 64**2).
            Default: None.
        iou_thr (float): IoU threshold to be considered as matched.
            Default: 0.5.
        dataset (list[str] | str | None): Dataset name or dataset classes,
            there are minor differences in metrics for different datasets, e.g.
            "voc07", "imagenet_det", etc. Default: None.
        logger (logging.Logger | str | None): The way to print the mAP
            summary. See `mmdet.utils.print_log()` for details. Default: None.
        tpfp_fn (callable | None): The function used to determine true/
            false positives. If None, :func:`tpfp_default` is used as default
            unless dataset is 'det' or 'vid' (:func:`tpfp_imagenet` in this
            case). If it is given as a function, then this function is used
            to evaluate tp & fp. Default None.
        nproc (int): Processes used for computing TP and FP.
            Default: 4.

    Returns:
        tuple: (mAP, [dict, dict, ...])
    """
    assert len(det_results) == len(annotations)

    num_imgs = len(det_results)
    num_scales = len(scale_ranges) if scale_ranges is not None else 1
    num_classes = len(det_results[0])  # positive class num
    area_ranges = ([(rg[0]**2, rg[1]**2) for rg in scale_ranges]
                   if scale_ranges is not None else None)

    pool = Pool(nproc)
    eval_results = []
    for i in range(num_classes):
        # get gt and det bboxes of this class
        cls_dets, cls_gts, cls_gts_ignore = get_cls_results(
            det_results, annotations, i)
        # choose proper function according to datasets to compute tp and fp
        if tpfp_fn is None:
            if dataset in ['det', 'vid']:
                tpfp_fn = tpfp_imagenet
            else:
                tpfp_fn = tpfp_default
        if not callable(tpfp_fn):
            raise ValueError(
                f'tpfp_fn has to be a function or None, but got {tpfp_fn}')

        # compute tp and fp for each image with multiple processes
        tpfp = pool.starmap(
            tpfp_fn,
            zip(cls_dets, cls_gts, cls_gts_ignore,
                [iou_thr for _ in range(num_imgs)],
                [area_ranges for _ in range(num_imgs)]))
        tp, fp = tuple(zip(*tpfp))
        # calculate gt number of each scale
        # ignored gts or gts beyond the specific scale are not counted
        num_gts = np.zeros(num_scales, dtype=int)
        for j, bbox in enumerate(cls_gts):
            if area_ranges is None:
                num_gts[0] += bbox.shape[0]
            else:
                gt_areas = (bbox[:, 2] - bbox[:, 0]) * (
                    bbox[:, 3] - bbox[:, 1])
                for k, (min_area, max_area) in enumerate(area_ranges):
                    num_gts[k] += np.sum((gt_areas >= min_area)
                                         & (gt_areas < max_area))
        # sort all det bboxes by score, also sort tp and fp
        cls_dets = np.vstack(cls_dets)
        num_dets = cls_dets.shape[0]
        sort_inds = np.argsort(-cls_dets[:, -1])
        tp = np.hstack(tp)[:, sort_inds]
        fp = np.hstack(fp)[:, sort_inds]
        # calculate recall and precision with tp and fp
        tp = np.cumsum(tp, axis=1)
        fp = np.cumsum(fp, axis=1)
        eps = np.finfo(np.float32).eps
        recalls = tp / np.maximum(num_gts[:, np.newaxis], eps)
        precisions = tp / np.maximum((tp + fp), eps)
        # calculate AP
        if scale_ranges is None:
            recalls = recalls[0, :]
            precisions = precisions[0, :]
            num_gts = num_gts.item()
        mode = 'area' if dataset != 'voc07' else '11points'
        ap = average_precision(recalls, precisions, mode)
        eval_results.append({
            'num_gts': num_gts,
            'num_dets': num_dets,
            'recall': recalls,
            'precision': precisions,
            'ap': ap
        })
    pool.close()
    if scale_ranges is not None:
        # shape (num_classes, num_scales)
        all_ap = np.vstack([cls_result['ap'] for cls_result in eval_results])
        all_num_gts = np.vstack(
            [cls_result['num_gts'] for cls_result in eval_results])
        mean_ap = []
        for i in range(num_scales):
            if np.any(all_num_gts[:, i] > 0):
                mean_ap.append(all_ap[all_num_gts[:, i] > 0, i].mean())
            else:
                mean_ap.append(0.0)
    else:
        aps = []
        for cls_result in eval_results:
            if cls_result['num_gts'] > 0:
                aps.append(cls_result['ap'])
        mean_ap = np.array(aps).mean().item() if aps else 0.0

    print_map_summary(
        mean_ap, eval_results, dataset, area_ranges, logger=logger)

    return mean_ap, eval_results
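
The recall/precision block above is a cumulative sum over detections sorted by descending score. A tiny worked example with made-up tp/fp flags for one class and a single scale range:

import numpy as np

# One scale range, five detections already sorted by descending score.
tp = np.array([[1, 1, 0, 1, 0]], dtype=float)
fp = np.array([[0, 0, 1, 0, 1]], dtype=float)
num_gts = np.array([4])  # four ground-truth boxes for this class

eps = np.finfo(np.float32).eps
tp_cum = np.cumsum(tp, axis=1)
fp_cum = np.cumsum(fp, axis=1)
recalls = tp_cum / np.maximum(num_gts[:, np.newaxis], eps)
precisions = tp_cum / np.maximum(tp_cum + fp_cum, eps)

print(recalls)     # [[0.25 0.5  0.5  0.75 0.75]]
print(precisions)  # approximately [[1. 1. 0.667 0.75 0.6]]

average_precision() then integrates this precision-recall curve, either by area or with the 11-point rule, as selected by the mode above.
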
Exemple #60
0
    q = Manager().Queue()  
    po = Pool()
    lock = Manager().Lock()
    
    args = parser.parse_args()
    dir_list = get_img_dir_list(args.imgDir)

    length = len(dir_list)
    if args.gpus == 0:
        start = 0
        end = length // 2
    else:
        start = length // 2 - 1
        end = length

    for i in range(start, end):
        q.put(dir_list[i])
    
    print('q.qsize=', q.qsize())
    
    for i in range(0, 5):
        po.apply_async(task, args=(args, q, lock))
    
    po.close()
    po.join()
    multiprocessing.freeze_support()

    print "Exiting Main Thread"