def handle(self, *args, **options):

        pool = Pool(settings.NUM_THREADS)
        conf = settings.TRAINER_CURRENCY_CONFIG['supervised_nn']

        print("Starting V2 run")
        for ticker in conf['ticker']:
            for hidden_layers in conf['hidden_layers']:
                for min_back in conf['min_back']:
                    for epochs in conf['epochs']:
                        for granularity in conf['granularity']:
                            for datasetinputs in conf['datasetinputs']:
                                for bias in conf['bias']:
                                    for momentum in conf['momentum']:
                                        for learningrate in conf['learningrate']:
                                            for weightdecay in conf['weightdecay']:
                                                for recurrent in conf['recurrent']:
                                                    for timedelta_back_in_granularity_increments in \
                                                            conf['timedelta_back_in_granularity_increments']:
                                                        pool.apply_async(do_prediction_test, args=(
                                                            ticker, hidden_layers, min_back, epochs, granularity,
                                                            datasetinputs,
                                                            learningrate, bias, momentum, recurrent, weightdecay,
                                                            timedelta_back_in_granularity_increments
                                                        ))
        print("All V2 jobs queued")
        pool.close()
        pool.join()
        print("V2 run complete")
Example #2
def main():
    parser = argparse.ArgumentParser(description='Analyze a bandersnatch mirror.')
    parser.add_argument('--json',
                       help='save raw data to a json file',
                       default=None)
    args = parser.parse_args()
    concurrency = 8
    root = "/var/spool/pypi/web/packages/source/"
    p = Pool(concurrency)
    results = {}
    try:
        try:
            for path, result in p.imap_unordered(analyse_sdist,
                                                 yield_packages(root)):
                results[path] = result
            p.close()
        except:
            p.terminate()
            raise
    finally:
        p.join()
    if args.json:
        with open(args.json, 'w') as f:
            json.dump(results, f)
    pprint.pprint(results)
def build_from_queries(queries):
    p = Pool(5)
    query_results = p.map(q_exec, queries)
    p.close()
    p.join()
    #process the query_results
    return query_results
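On Python 3 the close/join pair in build_from_queries can be replaced by using the pool as a context manager; the context manager calls terminate() on exit, which is safe here because map() has already collected every result. A minimal sketch, assuming the same q_exec helper:

from multiprocessing import Pool

def build_from_queries(queries):
    # Pool.__exit__ terminates the workers; map() has already returned all results
    with Pool(5) as p:
        query_results = p.map(q_exec, queries)
    # process the query_results
    return query_results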
def averageSimilarityMatrix(dictOfClassesLocal, dictOfWeightsLocal,title="Cluster similarity matrix",savePlot=False):
	global bar, progressCount, dictOfClasses, dictOfWeights, arguments, distances
	dictOfClasses = dictOfClassesLocal
	dictOfWeights = dictOfWeightsLocal
	print "Computing similarity matrix..."
	bar = progressbar.ProgressBar(maxval=len(dictOfClasses.keys())**2, widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
	bar.start()
	progressCount = 0

	# Initialise task matrix
	arguments = []
	distances = []
	for i,k in zip(range(len(dictOfClasses.keys())),reversed(sorted(dictOfClasses.keys()))):
		arguments.append([])
		distances.append([])
		for j in sorted(dictOfClasses.keys()):
			arguments[i].append((dictOfClasses[k],dictOfClasses[j],dictOfWeights[k],dictOfWeights[j]))
			distances[i].append([])

	# Distribute tasks
	poo = Pool()
	for i in range(len(dictOfClasses.keys())**2):
		poo.apply_async(interClassDistance,(i,),callback=updateResultMatrix)
	poo.close()
	poo.join()

	bar.finish()
	plot.plotSimilarityMatrix(distances,sorted(dictOfClasses.keys()),title,savePlot)
Example #5
def main():
    global output_doc_path
    if output_doc_path == '':
        output_doc_path = os.path.join(os.path.split(input_doc_path)[0], 'outputTinypng')
    if not os.path.exists(output_doc_path):
        os.mkdir(output_doc_path)

    for parent, dirnames, filenames in os.walk(input_doc_path):  # os.walk yields: 1. the parent directory 2. all directory names (without path) 3. all file names
        for dirname in dirnames:  # directory info
            # print("parent is:" + parent)
            # print("dirname is" + dirname)
            outDir = os.path.join(output_doc_path, os.path.relpath(os.path.join(parent, dirname), input_doc_path))
            if not os.path.exists(outDir):
                os.mkdir(outDir)

        for filename in filenames:  # file info
            # print("parent is:" + parent)
            # print("filename is:" + filename)
            filePaths.append(os.path.join(parent, filename))

    pngFilePaths = filter(lambda x:os.path.splitext(x)[1]=='.png' or os.path.splitext(x)[1]=='.jpg',filePaths)
    print('Parent process %s.' % os.getpid())
    p = Pool(poolLimite)
    for fileName in pngFilePaths:
        p.apply_async(getTinyPng, args=(fileName,))
    print('Waiting for all subprocesses done...')
    p.close()
    p.join()
    print('All subprocesses done.')
Example #6
def calcSynScores(scoresO,aabrhRawScoreSummmaryD,geneNames,geneOrderT,synWSize,numSynToTake,numThreads):
    '''Calculate the synteny score between two genes and add to edge
attributes of scoresO. We only bother making synteny scores for those
genes that have an edge in scoresO.
    '''
    
    neighborTL = createNeighborL(geneNames,geneOrderT,synWSize)

    # make list of groups of arguments to be passed to p.map. There
    # should be numThreads groups.
    argumentL = [([],neighborTL,numSynToTake,geneNames,aabrhRawScoreSummmaryD,scoresO) for i in range(numThreads)]

    i=0
    for gn1,gn2 in scoresO.iterateEdgesByEndNodes():
        argumentL[i%numThreads][0].append((gn1,gn2))
        i+=1

    p=Pool(numThreads) # num threads
    synScoresLL = p.map(synScoreGroup, argumentL)
    p.close()
    p.join()
    
    # add to scores object
    for synScoresL in synScoresLL:
        for gn1,gn2,sc in synScoresL:
            scoresO.addScoreByEndNodes(gn1,gn2,sc,'synSc')

    return scoresO
def expand_all_commits(code_dir, target_dir, only_year=None):
  print code_dir
  uname_lookup_by_year_q = load_uname_lookup_by_year_q()
  latest_submissions = get_latest_submissions(code_dir)
  num_students = len(latest_submissions)

  def get_commit_args(args):
    i, student = args
    latest_submit = latest_submissions[student]
    student_dir = os.path.join(code_dir, latest_submit)
    year_q = get_submit_time(student_dir) 
    if (not year_q) or only_year != year_q: return (-1,'','',-1,'',-1)
    year_target_dir = os.path.join(target_dir, year_q)
    if year_q not in uname_lookup_by_year_q or \
          latest_submit not in uname_lookup_by_year_q[year_q]:
        add_uname_to_lookup(latest_submit, year_q, uname_lookup_by_year_q)
    student_id = uname_lookup_by_year_q[year_q][latest_submit]
    #if student_id != '2012010247': return (-1,'','',-1,'',-1)
    return i, student, student_dir, student_id, year_target_dir, num_students

  students = sorted(latest_submissions.keys())
  zipped_args = map(get_commit_args, enumerate(students))
  non_students = [student for i, student in enumerate(students) if zipped_args[i][0] == -1]
  #print "unsuccessful"
  #print '\n'.join([latest_submissions[student] for student in non_students])
  pool = ThreadPool(8)
  results = pool.map(thread_process_commit, zipped_args)
  pool.close()
  pool.join()
  export_uname_lookup_by_year_q(uname_lookup_by_year_q)
Example #8
def import_images(folder, par=True, ttime=True):
  """
  This function loads images from a folder as PIL Image files and
  thresholds them, creating a list of z-slices to be turned into a matrix
  This version is not currently used.
  """
  fils = os.listdir(folder)
  def keep_tifs(rawlist):
    tiflist = []
    for f in rawlist:
      if len(f.split('.'))>1:
        if f.split('.')[1] == 'tif':
          tiflist.append(f)
    return tiflist
  tiflist = keep_tifs(fils)
  newtiflist = sorted([folder+f for f in tiflist]) # alphabetize
  tifobjs = [load_img_array(f) for f in newtiflist]
  
  # here start parallel stuff
  if par or ttime:
    start_time_par = timer()
    pool = Pool(8)
    results_par = pool.map(show_at_thresh, tifobjs)
    pool.close()
    pool.join()
    total_time_par = timer() - start_time_par
  # or non-parallel stuff
  if not par or ttime:  # 'if' rather than 'elif' so both timings are available when ttime is True
    start_time_nopar = timer()
    results_nopar = [show_at_thresh(f) for f in newtiflist]
    total_time_nopar = timer() - start_time_nopar
  print('Time for parallel: %.2f seconds' % total_time_par)
  print('Time for non-parallel: %.2f seconds' % total_time_nopar)
  
  return results_par, results_nopar
Example #9
def calcRawScores(fastaFilePath,numThreads,geneNames,gapOpen, gapExtend, matrix, scoresO):
    '''Get a global alignment based raw score for every edge in scoresO.'''

    # load sequences
    protFnL=glob.glob(fastaFilePath)
    seqD=genomes.loadProt(protFnL)
                
    # make list of sets of arguments to be passed to p.map. There
    # should be numThreads sets.
    argumentL = [([],seqD,gapOpen, gapExtend, matrix) for i in range(numThreads)]

    i=0
    for g1,g2 in scoresO.iterateEdgesByEndNodes():
        edgeNum = scoresO.endNodesToEdge(g1,g2)
        edgeT = edgeNum,geneNames.numToName(g1),geneNames.numToName(g2)
        argumentL[i%numThreads][0].append(edgeT)
        i+=1
        
    # run
    p=Pool(numThreads)
    scoresLL = p.map(rawScoreGroup, argumentL)
    p.close()
    p.join()

    
    # store in scoresO
    for scoresL in scoresLL:
        for edgeNum,sc in scoresL:
            scoresO.addScoreByEdge(edgeNum,sc,'rawSc')

    return scoresO
Example #10
 def correction_terms_threaded(self):
     '''Finds the correction terms associated to the quadratic form,
     for each of the equivalence classes it finds the maximum by 
     iterating through the relation vectors of the group. 
     
     Uses multiprocessing.'''
     print 'Using multiprocessing'
     pool = Pool() # default: processes=None => uses cpu_count()
     manager = Manager()
     start_time = time.time()
     coef_lists = lrange(self.group.structure)
     # representatives = elements of C_1(V) (np.matrix)
     representatives = map(lambda l: self.find_rep(l), coef_lists)
     # list of maxes        
     lst = manager.list([None for i in xrange(len(representatives))]) 
     alphalist = list(self.get_alpha()) # cannot pickle generators
     pool.map_async(functools.partial(process_alpha_outside, self, 
                                      representatives, lst), alphalist)
     pool.close()
     pool.join() # wait for pool to finish
     # get corrterms via (|alpha|^2+b)/4
     print 'Computed from quadratic form in %g seconds' \
           % (time.time() - start_time)
     return [Fraction(Fraction(alpha, self.int_inverse[1]) + self.b, 4) \
                     for alpha in lst]            
Example #11
def getData():
    if os.path.isfile("chat_urls.p"):
        chat_urls = pickle.load( open( "chat_urls.p", "rb" ) )
    else:
        chat_urls = {}
        for user in users:
            chat_urls[user] = get_urls(user)
        teams_url = "http://espn.go.com/mlb/teams"
        pickle.dump( chat_urls, open( "chat_urls.p", "wb" ) )

    # for user in chat_urls:
    #     urls = chat_urls[user]
    #     for url in urls:
    #         getLog(url)
    logDB = {}
    for user in chat_urls:
        logDB[user] = {}
    p = Pool(20)
    i=0
    manager = Manager()
    db = manager.dict()
    for user in chat_urls:
        for url in chat_urls[user]:
            i+=1
            p.apply_async(addLogData, args=(url,db))
    p.close()
    p.join()
    out = db._getvalue()
    outfile = open("rawChat.txt","wb")
    for url in out:
        outfile.write(out[url]+"\n")
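The Manager dict above is one way to collect worker output; another is to keep the AsyncResult handles returned by apply_async and read them back after join(). A hedged sketch of that variant, assuming a hypothetical fetch_log(url) helper that returns the chat log text instead of writing into the shared db:

p = Pool(20)
pending = {}
for user in chat_urls:
    for url in chat_urls[user]:
        # fetch_log is assumed to return the log text for this url
        pending[url] = p.apply_async(fetch_log, args=(url,))
p.close()
p.join()
with open("rawChat.txt", "w") as outfile:
    for url, res in pending.items():
        outfile.write(res.get() + "\n")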
Example #12
def multiprocessing_method(n_of_darts, n_of_pools = 10):
	'''Uses 10 processes to split the work of dart_algorithm. Since the processes are independent, \
	I split the number of darts over the number of processes, and used the pool function. Instead of \
	looping over the number of processes, which would be time-consuming, I wrote them explicitly.'''


	pool = Pool(processes=n_of_pools)  
	darts_per_pool = n_of_darts/n_of_pools

	start=time()
	result1 = pool.map_async(dart_algorithm, [darts_per_pool])
	result2 = pool.map_async(dart_algorithm, [darts_per_pool])
	result3 = pool.map_async(dart_algorithm, [darts_per_pool])
	result4 = pool.map_async(dart_algorithm, [darts_per_pool])
	result5 = pool.map_async(dart_algorithm, [darts_per_pool])
	result6 = pool.map_async(dart_algorithm, [darts_per_pool])
	result7 = pool.map_async(dart_algorithm, [darts_per_pool])
	result8 = pool.map_async(dart_algorithm, [darts_per_pool])
	result9 = pool.map_async(dart_algorithm, [darts_per_pool])
	result10 = pool.map_async(dart_algorithm, [darts_per_pool])
	end=time()
	
	Pool_darts_in_circle = result1.get()[0] + result2.get()[0]+result3.get()[0] +\
	result4.get()[0]+result5.get()[0]+result6.get()[0]+ result7.get()[0]+ \
	result8.get()[0]+ result9.get()[0]+ result10.get()[0]
	multiprocessing_time = end-start
	
	pool.close()
	pool.join()
	
	pi = 4 * Pool_darts_in_circle / float(n_of_darts)
	
	# print "multiprocessing pi approximation= ", pi
	
	return multiprocessing_time
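The ten explicit map_async calls can be collapsed into a single blocking map over a list of per-process workloads. A minimal sketch, assuming the same dart_algorithm helper and the same Pool/time imports as above:

def multiprocessing_method(n_of_darts, n_of_pools=10):
    pool = Pool(processes=n_of_pools)
    darts_per_pool = n_of_darts // n_of_pools
    start = time()
    # one task per process; map() blocks until every worker returns its hit count
    counts = pool.map(dart_algorithm, [darts_per_pool] * n_of_pools)
    multiprocessing_time = time() - start
    pool.close()
    pool.join()
    pi = 4 * sum(counts) / float(n_of_darts)  # pi approximation, as in the original
    return multiprocessing_time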
Example #13
def dirImgProcess(path):
    global workerPool, workerOutput, theGreatIndex
    workerPool = Pool()
    workerOutput = []
    work = []
    theGreatIndex = {}
    pagenumber = 0
    for (dirpath, dirnames, filenames) in os.walk(path):
        for afile in filenames:
            if getImageFileName(afile) is not None:
                pagenumber += 1
                work.append([afile, dirpath, options])
    if GUI:
        GUI.progressBarTick.emit(str(pagenumber))
    if len(work) > 0:
        for i in work:
            workerPool.apply_async(func=fileImgProcess, args=(i, ), callback=fileImgProcess_tick)
        workerPool.close()
        workerPool.join()
        if GUI and not GUI.conversionAlive:
            rmtree(os.path.join(path, '..', '..'), True)
            raise UserWarning("Conversion interrupted.")
        if len(workerOutput) > 0:
            rmtree(os.path.join(path, '..', '..'), True)
            raise RuntimeError("One of workers crashed. Cause: " + workerOutput[0])
    else:
        rmtree(os.path.join(path, '..', '..'), True)
        raise UserWarning("Source directory is empty.")
def test_word2id():
    """把测试集的所有词转成对应的id。"""
    time0 = time.time()
    print('Processing eval data.')
    df_eval = pd.read_csv('../raw_data/question_eval_set.txt', sep='\t', usecols=[0, 2, 4],
                          names=['question_id', 'word_title', 'word_content'], dtype={'question_id': object})
    print('test question number %d' % len(df_eval))
    # For questions without a title, substitute the content
    na_title_indexs = list()
    for i in xrange(len(df_eval)):
        word_title = df_eval.word_title.values[i]
        if type(word_title) is float:
            na_title_indexs.append(i)
    print('There are %d test questions without title.' % len(na_title_indexs))
    for na_index in na_title_indexs:
        df_eval.at[na_index, 'word_title'] = df_eval.at[na_index, 'word_content']
    # For questions without content, substitute the title
    na_content_indexs = list()
    for i in tqdm(xrange(len(df_eval))):
        word_content = df_eval.word_content.values[i]
        if type(word_content) is float:
            na_content_indexs.append(i)
    print('There are %d test questions without content.' % len(na_content_indexs))
    for na_index in tqdm(na_content_indexs):
        df_eval.at[na_index, 'word_content'] = df_eval.at[na_index, 'word_title']
    # Convert words to id form
    p = Pool()
    eval_title = np.asarray(p.map(get_id4words, df_eval.word_title.values))
    np.save('../data/wd_eval_title.npy', eval_title)
    eval_content = np.asarray(p.map(get_id4words, df_eval.word_content.values))
    np.save('../data/wd_eval_content.npy', eval_content)
    p.close()
    p.join()
    print('Finished changing the eval words to ids. Costed time %g s' % (time.time() - time0))
def train_word2id():
    """把训练集的所有词转成对应的id。"""
    time0 = time.time()
    print('Processing train data.')
    df_train = pd.read_csv('../raw_data/question_train_set.txt', sep='\t', usecols=[0, 2, 4],
                           names=['question_id', 'word_title', 'word_content'], dtype={'question_id': object})
    print('training question number %d ' % len(df_train))
    # For questions without content, substitute the title
    na_content_indexs = list()
    for i in tqdm(xrange(len(df_train))):
        word_content = df_train.word_content.values[i]
        if type(word_content) is float:
            na_content_indexs.append(i)
    print('There are %d train questions without content.' % len(na_content_indexs))
    for na_index in tqdm(na_content_indexs):
        df_train.at[na_index, 'word_content'] = df_train.at[na_index, 'word_title']
    # Drop questions without a title
    na_title_indexs = list()
    for i in xrange(len(df_train)):
        word_title = df_train.word_title.values[i]
        if type(word_title) is float:
            na_title_indexs.append(i)
    print('There are %d train questions without title.' % len(na_title_indexs))
    df_train = df_train.drop(na_title_indexs)
    print('After dropping, training question number(should be 2999952) = %d' % len(df_train))
    # Convert words to id form
    p = Pool()
    train_title = np.asarray(p.map(get_id4words, df_train.word_title.values))
    np.save('../data/wd_train_title.npy', train_title)
    train_content = np.asarray(p.map(get_id4words, df_train.word_content.values))
    np.save('../data/wd_train_content.npy', train_content)
    p.close()
    p.join()
    print('Finished changing the training words to ids. Costed time %g s' % (time.time() - time0))
Example #16
    def _run(self, source, destination_format, clear_source=False, workers=-1):
        """
        parallel version of the `convert` method
        :param source: (rdf) files to convert (source path)
        :param destination_format: the destination format
        :param clear_source: if set, delete the source files. Default = False
        :param workers: number of worker processes to use; values outside (0, cpu_count) fall back to the CPU count
        :return: None
        """

        files = []
        src = os.path.abspath(source)
        if os.path.isdir(src):
            files = [os.path.join(src, f) for f in os.listdir(src) if to_process(f, destination_format)]
        elif os.path.exists(src):
            files = [src]
        self._log.info('to process: {0}'.format(files))
        if clear_source:
            self._log.warn('will remove original files after conversion')

        def job_finished(res):
            print '.',
            sys.stdout.flush()

        num_cpus = cpu_count()
        num_workers = workers if 0 < workers < num_cpus else num_cpus

        pool = Pool(processes=num_workers)

        for src in files:
            dst = dest_file_name(src, destination_format)
            if dst:
                pool.apply_async(convert_file, (src, dst, clear_source), callback=job_finished)

        pool.close()
        pool.join()
Example #17
def multi_remote_exec_cmd(hosts, username, cmd):
    pool = Pool(processes=5)
    for host in hosts:
        username, password, ip, port = get_connect_item(username, host)
        pool.apply_async(remote_exec_cmd, (ip, port, username, password, cmd))
    pool.close()
    pool.join()
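apply_async discards worker exceptions unless the AsyncResult is checked; a minimal variant of the same fan-out that keeps the handles and calls get() so a failure on any host is re-raised:

def multi_remote_exec_cmd(hosts, username, cmd):
    pool = Pool(processes=5)
    async_results = []
    for host in hosts:
        username, password, ip, port = get_connect_item(username, host)
        async_results.append(
            pool.apply_async(remote_exec_cmd, (ip, port, username, password, cmd)))
    pool.close()
    pool.join()
    for res in async_results:
        res.get()  # re-raises any exception raised inside the worker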
Example #18
def _doFastPoW(target, initialHash):
    import shared
    import time
    from multiprocessing import Pool, cpu_count
    try:
        pool_size = cpu_count()
    except:
        pool_size = 4
    try:
        maxCores = config.getint('bitmessagesettings', 'maxcores')
    except:
        maxCores = 99999
    if pool_size > maxCores:
        pool_size = maxCores
    pool = Pool(processes=pool_size)
    result = []
    for i in range(pool_size):
        result.append(pool.apply_async(_pool_worker, args = (i, initialHash, target, pool_size)))
    while True:
        if shared.shutdown:
            pool.terminate()
            while True:
                time.sleep(10) # Don't let this thread return here; it will return nothing and cause an exception in bitmessagemain.py
            return
        for i in range(pool_size):
            if result[i].ready():
                result = result[i].get()
                pool.terminate()
                pool.join() #Wait for the workers to exit...
                return result[0], result[1]
        time.sleep(0.2)
Example #19
   def get(self, tag="貓咪", max_tag_id=None):
      if tag == "":
         tag = "貓咪"
      p = Pool(10)

      if self.prefix == "ajax":
         medias, next_ = util.search_by_tag(tag, 3, max_tag_id)
      else:
         medias, next_ = util.search_by_tag(tag, 5, max_tag_id)

      fs = p.map(util.features, medias)
      p_label, _, _ = libsvm.svm_predict([1] * len(fs), fs, model)
      for (m, f) in zip(medias, fs):
         print(m["caption"]["text"])
         print(f)
      if self.prefix == "ajax":
         medias = map(lambda (m, l): Media(m, l).__dict__, zip(medias, p_label))
         self.write(json.dumps({
            "max_tag_id": next_,
            "medias": medias
         }))
      else:
         medias = map(lambda (m, l): Media(m, l), zip(medias, p_label))
         if self.prefix == "demo1":
            self.render("demo1.html", medias=medias, tag_name=tag, max_tag_id=next_)
         elif self.prefix == "demo2":
            self.render("demo2.html", medias=medias, tag_name=tag, max_tag_id=next_)
         else:
            self.render("main.html", medias=medias, tag_name=tag, max_tag_id=next_)

      p.close()
      p.join()
def run_make_submission(settings, targets_and_pipelines, split_ratio):
    pool = Pool(settings.N_jobs)
    for i, (target, pipeline, feature_masks, classifier, classifier_name) in enumerate(targets_and_pipelines):
        for j, feature_mask in enumerate(feature_masks):
            progress_str = 'T=%d/%d M=%d/%d' % (i+1, len(targets_and_pipelines), j+1, len(feature_masks))
            pool.apply_async(make_submission_predictions, [settings, target, pipeline, classifier, classifier_name],
                {'feature_mask': feature_mask, 'progress_str': progress_str, 'quiet': True})
    pool.close()
    pool.join()

    guesses = ['clip,preictal']
    num_masks = None
    classifier_names = []
    for target, pipeline, feature_masks, classifier, classifier_name in targets_and_pipelines:
        classifier_names.append(classifier_name)
        if num_masks is None:
            num_masks = len(feature_masks)
        else:
            assert num_masks == len(feature_masks)

        test_predictions = []

        for feature_mask in feature_masks:
            data = make_submission_predictions(settings, target, pipeline, classifier, classifier_name, feature_mask=feature_mask)
            test_predictions.append(data.mean_predictions)

        predictions = np.mean(test_predictions, axis=0)
        guesses += make_csv_for_target_predictions(target, predictions)

    output = '\n'.join(guesses)
    write_submission_file(settings, output, 'ensemble n=%d split_ratio=%s' % (num_masks, split_ratio), None, str(classifier_names), targets_and_pipelines)
Example #21
def run_train_models(processes, model_library, **kwargs):
    """Train many supervised learning problems in parallel

    model_library = a list specifying the model library for the dataset in
            format needed for TrainModelCV
            **kwargs: all the rest of the input to TrainModelCV"""
    # sample input for model_library:
    #          [[LogisticRegression, classification_error, 'parameters.json', (), {'lam':0.5}],
    #          [LogisticRegression, auc_wmw_fast, None, (), {'C':50}]]

    # use a process pool to execute all the training jobs
    # collect the results and combine to return
    from multiprocessing import Pool

    p = Pool(processes)

    #ret = {}
    #for model in model_library:
    #    p.apply_async(_pool_helper, (model_library, ), kwargs, callback=ret.update)

    results = []
    for model in model_library:
        results.append(p.apply_async(_pool_helper, (model, ), kwargs))

    # wait on the pool to finish
    p.close()
    p.join()

    # collect the results
    ret = {}
    for result in results:
        ret.update(result.get())

    return ret
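The commented-out lines above sketch a callback-based alternative; spelled out (passing each model rather than the whole library, as the live loop does), it would look roughly like this:

    p = Pool(processes)
    ret = {}
    for model in model_library:
        # the callback receives the dict returned by _pool_helper and merges it in
        p.apply_async(_pool_helper, (model, ), kwargs, callback=ret.update)
    p.close()
    p.join()
    return ret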
Example #22
class JobPool(object):

    """
    Pool container.
    """
    pool = None
    message_queue = None

    def __init__(self, max_instances=4):
        self.message_queue = Queue()
        self.pool = Pool(max_instances, execute_task, (self.message_queue,))
        atexit.register(self.clear)

    def add_analysis(self, analysis):
        """
        Add analysis to the pool.
        """
        analysis.set_started()
        self.message_queue.put(analysis)

    def clear(self):
        """
        Pool cleanup.
        """
        self.pool.terminate()
        self.pool.join()
def enumerate_all_subgraphs_upto_size_k_parallel(document_graph, k, num_of_workers=4):
    """
    returns all subgraphs of a DiscourseDocumentGraph (i.e. a MultiDiGraph)
    with up to k nodes. This is a trivially parallelized version of
    enumerate_all_subgraphs_upto_size_k()
    """
    document_nodes = len(document_graph)
    if k > document_nodes:
        k = document_nodes

    int_graph = nx.convert_node_labels_to_integers(nx.DiGraph(document_graph),
                                                   first_label=1,
                                                   label_attribute='node_id')

    pool = Pool(processes=num_of_workers) # number of CPUs
    results = [pool.apply_async(enumerate_all_size_k_subgraphs, args=(int_graph, i))
                for i in xrange(1, k+1)]
    pool.close()
    pool.join()

    subgraphs = []
    for result in results:
        tmp_result = result.get()
        if isinstance(tmp_result, list):
            subgraphs.extend(tmp_result)
        else:
            subgraphs.append(tmp_result)
    return subgraphs
    def worker(self, db, lista):
        '''
        Multithread method for the online plugin search.
        '''

        # Make the Pool of workers
        processes = 20
        # WARNING: on fiber I can go up to 20 workers without errors; on ADSL at most 4!
        pool = Pool(processes)

        # Open the urls in their own threads and return the results
        pluglist = pool.map(onlinePluginSearch, lista)

        #close the pool and wait for the work to finish
        pool.close()
        pool.join()

        # parse the result (a list of tuples), collect everything into a string (result) and update the cache
        result = ''
        for item in pluglist:
            if item[1] !=[]:
                for plug in item[1]:
                    db.updateCache(item[0], plug)
                    result = result + plug + ','

        numbers = result.count(',') + 1
        print("Number of available pflugins: %s" % numbers)
        print("Adding to policy plugins: 19506,10287,12634 for credential checks and ping target.")
        result = result + "19506,10287,12634"
        # always add these 3 plug-ins to check whether the target is alive

        return result
def get_classify():
    classify = {"type1": "美食", "type2": "休闲娱乐",  "type4-sub_type1": "酒店",  "type5": "购物",  "type6": "办卡送礼",
                "type7": "旅游",  "type9": "汽车",  "type10": "时尚丽人",  "type11": "生活服务",
                "type4-sub_type2": "出行", "type4-sub_type3": "出行"}
    for name in classify:
        total_num_of_page = get_num_of_page('http://www.rong360.com/credit/f-youhui-' + name)
        print classify[name] + ": " + str(total_num_of_page)
        thread_num = 20  # num of process
        section_size = 50
        section = total_num_of_page / section_size
        if total_num_of_page % section_size > 0:
            section += 1

        for k in range(section):
            begin = k * section_size + 1
            end = begin + section_size - 1
            end = min(end, total_num_of_page)
            print "start to get summary pages from " + str(begin) + " to " + str(end) + \
                  ", each summary page contains 20 detail content pages."
            manager = multiprocessing.Manager()
            queue = manager.Queue()  # a queue storing index of url
            queue.put(begin - 1)  # Initialization of url index

            page_queue = manager.Queue()  # a queue storing end of urls

            # start multiprocess to get urls
            pool = Pool(thread_num)
            for i in range(thread_num):
                pool.apply_async(get_page_url, args=(queue, end, page_queue, '-' + name))
            pool.close()
            pool.join()
            print 'num of total pages: ' + str(page_queue.qsize())

            store_data.insert_column("classify", classify[name], page_queue)
def main(args):
    """
    Main function
    :param args: argparse dict
    :return: None
    """

    print "Start reindexing from {0} to {1} with batch size of {2} and {3} worker processes".format(
        args.source_index, args.destination_index, args.batch_size, args.processes
    )

    client = Elasticsearch()
    print "connected to elastic search at http://localhost:9200"

    docs = scan(client, index=args.source_index)

    count = 0

    queue = Queue(args.batch_size)  # don't fill up queue too much
    pool = Pool(args.processes, worker_main, (queue, args.source_index, args.destination_index, args.batch_size))

    for doc in docs:
        count += 1
        if count % args.batch_size == 0:
            print "put {0}".format(count)
        queue.put(doc, True)
    print "put {0}".format(count)

    # send stop messages
    for i in range(args.processes):
        queue.put(Stop, True)

    pool.close()
    pool.join()
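worker_main is not shown in this example; a hedged sketch of what it is assumed to look like — drain the queue in batches, re-target each document at the destination index, and stop on the Stop sentinel used by main() above:

def worker_main(queue, source_index, destination_index, batch_size):
    from elasticsearch import Elasticsearch
    from elasticsearch.helpers import bulk
    client = Elasticsearch()
    batch = []
    while True:
        doc = queue.get(True)
        if doc is Stop:  # sentinel pushed by main() after the scan is exhausted
            break
        doc['_index'] = destination_index  # re-target the hit at the new index
        batch.append(doc)
        if len(batch) >= batch_size:
            bulk(client, batch)
            batch = []
    if batch:
        bulk(client, batch)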
Example #27
def datasetToNumpy(dataset,sliceSel=None,chunksize=1):
  size = dataset.size
  n = dataset.shape[0]
  if (sliceSel is None): sliceSel = slice(0,n,1)
  isMultiProcessUseful = size> (1024*1024)
  if (chunksize > 1) and isMultiProcessUseful:
    # subdivide indices in chunksize
    start,stop,step = sliceSel.indices(n)
    nC = int(float(stop-start)/step/chunksize+0.5)
    print(nC)
    args = []
    for i in range(nC):
      s1 = start+i*(chunksize*step)
      s2 = start+(i+1)*(chunksize*step)
      print(i,s1,s2)
      args.append( (dataset,slice(s1,s2,step) ) )
    print(args)
    raw_input("Not working yet, use chunksize = 1")
    p = Pool(); # 16-43 ms overhead
    res = p.map_async(f,args,chunksize=1)
    p.close()
    p.join()
    data = np.asarray(res.get())
  else:
    data = dataset[sliceSel]
  return data
Example #28
class MultiProcessScheduler(LocalScheduler):
    def __init__(self, threads):
        LocalScheduler.__init__(self)
        self.threads = threads
        self.tasks = {}
        from multiprocessing import Pool
        self.pool = Pool(self.threads or 2)

    def start(self):
        pass

    def submitTasks(self, tasks):
        def callback(args):
            logger.debug("got answer: %s", args)
            tid, reason, result, update = args
            task = self.tasks.pop(tid)
            self.taskEnded(task, reason, result, update)

        for task in tasks:
            logger.debug("put task async: %s", task)
            self.tasks[task.id] = task
            self.pool.apply_async(run_task_in_process,
                [task, self.nextAttempId(), env.environ],
                callback=callback)

    def stop(self):
        self.pool.terminate()
        self.pool.join()
        logger.debug("process pool stopped")
    def multiprocessing(self, map_func, callback=None):
        """
        Pass a function to perform on each volume of the feature reader, using
        multiprocessing (map), then process the combined outputs (reduce).

        map_func

        Function to run on each individual volume. Takes as input a tuple
        containing a feature_reader and volume path, from which a volume can be
        created. Returns a (key, value) tuple.

        def do_something_on_vol(args):
            fr, path = args
            vol = fr.create_volume(path)
            # Do something with 'vol'
            return (key, value)

        """
        # Match process count to cpu count
        p = Pool()
        # f = self._wrap_func(func)
        results = p.map(map_func, self._mp_paths(), chunksize=5)
        # , callback=callback)
        p.close()
        p.join()
        return results
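A minimal call-site sketch, assuming fr is the object exposing this method and do_something_on_vol is written as in the docstring:

    results = fr.multiprocessing(do_something_on_vol)
    combined = dict(results)  # reduce step: merge the (key, value) pairs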
Example #30
class YaraJobPool(object):

    """
    Yara pool container.
    """
    pool = None
    message_queue = None

    def __init__(self, max_instances=3):
        self.message_queue = Queue()
        self.pool = Pool(max_instances, execute_yara_task,
                         (self.message_queue,))
        atexit.register(self.clear)

    def add_yara_task(self, yara_task):
        """
        Adds the yara task.
        """
        self.message_queue.put(yara_task)

    def clear(self):
        """
        Pool cleanup.
        """
        self.pool.terminate()
        self.pool.join()
Example #31
def _main():
    """Called when the module is executed"""
    def process_reports(reports_):
        output_str = "{0}\n".format(json.dumps(reports_,
                                               ensure_ascii=False,
                                               indent=2))
        if not opts.silent:
            print(output_str)
        if opts.kafka_hosts:
            try:
                ssl_context = None
                if opts.kafka_skip_certificate_verification:
                    logger.debug("Skipping Kafka certificate verification")
                    ssl_context = create_default_context()
                    ssl_context.check_hostname = False
                    ssl_context.verify_mode = CERT_NONE
                kafka_client = kafkaclient.KafkaClient(
                    opts.kafka_hosts,
                    username=opts.kafka_username,
                    password=opts.kafka_password,
                    ssl_context=ssl_context
                )
            except Exception as error_:
                logger.error("Kafka Error: {0}".format(error_.__str__()))
        if opts.save_aggregate:
            for report in reports_["aggregate_reports"]:
                try:
                    if opts.elasticsearch_hosts:
                        shards = opts.elasticsearch_number_of_shards
                        replicas = opts.elasticsearch_number_of_replicas
                        elastic.save_aggregate_report_to_elasticsearch(
                            report,
                            index_suffix=opts.elasticsearch_index_suffix,
                            monthly_indexes=opts.elasticsearch_monthly_indexes,
                            number_of_shards=shards,
                            number_of_replicas=replicas
                        )
                except elastic.AlreadySaved as warning:
                    logger.warning(warning.__str__())
                except elastic.ElasticsearchError as error_:
                    logger.error("Elasticsearch Error: {0}".format(
                        error_.__str__()))
                try:
                    if opts.kafka_hosts:
                        kafka_client.save_aggregate_reports_to_kafka(
                            report, kafka_aggregate_topic)
                except Exception as error_:
                    logger.error("Kafka Error: {0}".format(
                         error_.__str__()))
            if opts.hec:
                try:
                    aggregate_reports_ = reports_["aggregate_reports"]
                    if len(aggregate_reports_) > 0:
                        hec_client.save_aggregate_reports_to_splunk(
                            aggregate_reports_)
                except splunk.SplunkError as e:
                    logger.error("Splunk HEC error: {0}".format(e.__str__()))
        if opts.save_forensic:
            for report in reports_["forensic_reports"]:
                try:
                    shards = opts.elasticsearch_number_of_shards
                    replicas = opts.elasticsearch_number_of_replicas
                    if opts.elasticsearch_hosts:
                        elastic.save_forensic_report_to_elasticsearch(
                            report,
                            index_suffix=opts.elasticsearch_index_suffix,
                            monthly_indexes=opts.elasticsearch_monthly_indexes,
                            number_of_shards=shards,
                            number_of_replicas=replicas)
                except elastic.AlreadySaved as warning:
                    logger.warning(warning.__str__())
                except elastic.ElasticsearchError as error_:
                    logger.error("Elasticsearch Error: {0}".format(
                        error_.__str__()))
                except InvalidDMARCReport as error_:
                    logger.error(error_.__str__())
                try:
                    if opts.kafka_hosts:
                        kafka_client.save_forensic_reports_to_kafka(
                            report, kafka_forensic_topic)
                except Exception as error_:
                    logger.error("Kafka Error: {0}".format(
                        error_.__str__()))
            if opts.hec:
                try:
                    forensic_reports_ = reports_["forensic_reports"]
                    if len(forensic_reports_) > 0:
                        hec_client.save_forensic_reports_to_splunk(
                            forensic_reports_)
                except splunk.SplunkError as e:
                    logger.error("Splunk HEC error: {0}".format(e.__str__()))

    arg_parser = ArgumentParser(description="Parses DMARC reports")
    arg_parser.add_argument("-c", "--config-file",
                            help="a path to a configuration file "
                                 "(--silent implied)")
    arg_parser.add_argument("file_path", nargs="*",
                            help="one or more paths to aggregate or forensic "
                                 "report files, emails, or mbox files'")
    strip_attachment_help = "remove attachment payloads from forensic " \
                            "report output"
    arg_parser.add_argument("--strip-attachment-payloads",
                            help=strip_attachment_help, action="store_true")
    arg_parser.add_argument("-o", "--output",
                            help="write output files to the given directory")
    arg_parser.add_argument("-n", "--nameservers", nargs="+",
                            help="nameservers to query")
    arg_parser.add_argument("-t", "--dns_timeout",
                            help="number of seconds to wait for an answer "
                                 "from DNS (default: 2.0)",
                            type=float,
                            default=2.0)
    arg_parser.add_argument("--offline", action="store_true",
                            help="do not make online queries for geolocation "
                                 " or  DNS")
    arg_parser.add_argument("-s", "--silent", action="store_true",
                            help="only print errors and warnings")
    arg_parser.add_argument("--debug", action="store_true",
                            help="print debugging information")
    arg_parser.add_argument("--log-file", default=None,
                            help="output logging to a file")
    arg_parser.add_argument("-v", "--version", action="version",
                            version=__version__)

    aggregate_reports = []
    forensic_reports = []

    args = arg_parser.parse_args()
    opts = Namespace(file_path=args.file_path,
                     config_file=args.config_file,
                     offline=args.offline,
                     strip_attachment_payloads=args.strip_attachment_payloads,
                     output=args.output,
                     nameservers=args.nameservers,
                     silent=args.silent,
                     dns_timeout=args.dns_timeout,
                     debug=args.debug,
                     save_aggregate=False,
                     save_forensic=False,
                     imap_host=None,
                     imap_skip_certificate_verification=False,
                     imap_ssl=True,
                     imap_port=993,
                     imap_timeout=30,
                     imap_max_retries=4,
                     imap_user=None,
                     imap_password=None,
                     imap_reports_folder="INBOX",
                     imap_archive_folder="Archive",
                     imap_watch=False,
                     imap_delete=False,
                     imap_test=False,
                     hec=None,
                     hec_token=None,
                     hec_index=None,
                     hec_skip_certificate_verification=False,
                     elasticsearch_hosts=None,
                     elasticsearch_timeout=60,
                     elasticsearch_number_of_shards=1,
                     elasticsearch_number_of_replicas=1,
                     elasticsearch_index_suffix=None,
                     elasticsearch_ssl=True,
                     elasticsearch_ssl_cert_path=None,
                     elasticsearch_monthly_indexes=False,
                     elasticsearch_username=None,
                     elasticsearch_password=None,
                     kafka_hosts=None,
                     kafka_username=None,
                     kafka_password=None,
                     kafka_aggregate_topic=None,
                     kafka_forensic_topic=None,
                     kafka_ssl=False,
                     kafka_skip_certificate_verification=False,
                     smtp_host=None,
                     smtp_port=25,
                     smtp_ssl=False,
                     smtp_skip_certificate_verification=False,
                     smtp_user=None,
                     smtp_password=None,
                     smtp_from=None,
                     smtp_to=[],
                     smtp_subject="parsedmarc report",
                     smtp_message="Please see the attached DMARC results.",
                     log_file=args.log_file,
                     n_procs=1,
                     chunk_size=1
                     )
    args = arg_parser.parse_args()

    if args.config_file:
        abs_path = os.path.abspath(args.config_file)
        if not os.path.exists(abs_path):
            logger.error("A file does not exist at {0}".format(abs_path))
            exit(-1)
        opts.silent = True
        config = ConfigParser()
        config.read(args.config_file)
        if "general" in config.sections():
            general_config = config["general"]
            if "offline" in general_config:
                opts.offline = general_config["offline"]
            if "strip_attachment_payloads" in general_config:
                opts.strip_attachment_payloads = general_config[
                    "strip_attachment_payloads"]
            if "output" in general_config:
                opts.output = general_config["output"]
            if "nameservers" in general_config:
                opts.nameservers = _str_to_list(general_config["nameservers"])
            if "dns_timeout" in general_config:
                opts.dns_timeout = general_config.getfloat("dns_timeout")
            if "save_aggregate" in general_config:
                opts.save_aggregate = general_config["save_aggregate"]
            if "save_forensic" in general_config:
                opts.save_forensic = general_config["save_forensic"]
            if "debug" in general_config:
                opts.debug = general_config.getboolean("debug")
            if "silent" in general_config:
                opts.silent = general_config.getboolean("silent")
            if "log_file" in general_config:
                opts.log_file = general_config["log_file"]
            if "n_procs" in general_config:
                opts.n_procs = general_config.getint("n_procs")
            if "chunk_size" in general_config:
                opts.chunk_size = general_config.getint("chunk_size")
        if "imap" in config.sections():
            imap_config = config["imap"]
            if "host" in imap_config:
                opts.imap_host = imap_config["host"]
            else:
                logger.error("host setting missing from the "
                             "imap config section")
                exit(-1)
            if "port" in imap_config:
                opts.imap_port = imap_config.getint("port")
            if "timeout" in imap_config:
                opts.imap_timeout = imap_config.getfloat("timeout")
            if "max_retries" in imap_config:
                opts.imap_max_retries = imap_config.getint("max_retries")
            if "ssl" in imap_config:
                opts.imap_ssl = imap_config.getboolean("ssl")
            if "skip_certificate_verification" in imap_config:
                imap_verify = imap_config.getboolean(
                    "skip_certificate_verification")
                opts.imap_skip_certificate_verification = imap_verify
            if "user" in imap_config:
                opts.imap_user = imap_config["user"]
            else:
                logger.critical("user setting missing from the "
                                "imap config section")
                exit(-1)
            if "password" in imap_config:
                opts.imap_password = imap_config["password"]
            else:
                logger.critical("password setting missing from the "
                                "imap config section")
                exit(-1)

            if "reports_folder" in imap_config:
                opts.imap_reports_folder = imap_config["reports_folder"]
            if "archive_folder" in imap_config:
                opts.imap_archive_folder = imap_config["archive_folder"]
            if "watch" in imap_config:
                opts.imap_watch = imap_config.getboolean("watch")
            if "delete" in imap_config:
                opts.imap_delete = imap_config.getboolean("delete")
            if "test" in imap_config:
                opts.imap_test = imap_config.getboolean("test")
        if "elasticsearch" in config:
            elasticsearch_config = config["elasticsearch"]
            if "hosts" in elasticsearch_config:
                opts.elasticsearch_hosts = _str_to_list(elasticsearch_config[
                    "hosts"])
            else:
                logger.critical("hosts setting missing from the "
                                "elasticsearch config section")
                exit(-1)
            if "timeout" in elasticsearch_config:
                timeout = elasticsearch_config.getfloat("timeout")
                opts.elasticsearch_timeout = timeout
            if "number_of_shards" in elasticsearch_config:
                number_of_shards = elasticsearch_config.getint(
                    "number_of_shards")
                opts.elasticsearch_number_of_shards = number_of_shards
                if "number_of_replicas" in elasticsearch_config:
                    number_of_replicas = elasticsearch_config.getint(
                        "number_of_replicas")
                    opts.elasticsearch_number_of_replicas = number_of_replicas
            if "index_suffix" in elasticsearch_config:
                opts.elasticsearch_index_suffix = elasticsearch_config[
                    "index_suffix"]
            if "monthly_indexes" in elasticsearch_config:
                monthly = elasticsearch_config.getboolean("monthly_indexes")
                opts.elasticsearch_monthly_indexes = monthly
            if "ssl" in elasticsearch_config:
                opts.elasticsearch_ssl = elasticsearch_config.getboolean(
                    "ssl")
            if "cert_path" in elasticsearch_config:
                opts.elasticsearch_ssl_cert_path = elasticsearch_config[
                    "cert_path"]
            if "user" in elasticsearch_config:
                opts.elasticsearch_username = elasticsearch_config[
                    "user"]
            if "password" in elasticsearch_config:
                opts.elasticsearch_password = elasticsearch_config[
                    "password"]
        if "splunk_hec" in config.sections():
            hec_config = config["splunk_hec"]
            if "url" in hec_config:
                opts.hec = hec_config["url"]
            else:
                logger.critical("url setting missing from the "
                                "splunk_hec config section")
                exit(-1)
            if "token" in hec_config:
                opts.hec_token = hec_config["token"]
            else:
                logger.critical("token setting missing from the "
                                "splunk_hec config section")
                exit(-1)
            if "index" in hec_config:
                opts.hec_index = hec_config["index"]
            else:
                logger.critical("index setting missing from the "
                                "splunk_hec config section")
                exit(-1)
            if "skip_certificate_verification" in hec_config:
                opts.hec_skip_certificate_verification = hec_config[
                    "skip_certificate_verification"]
        if "kafka" in config.sections():
            kafka_config = config["kafka"]
            if "hosts" in kafka_config:
                opts.kafka_hosts = _str_to_list(kafka_config["hosts"])
            else:
                logger.critical("hosts setting missing from the "
                                "kafka config section")
                exit(-1)
            if "user" in kafka_config:
                opts.kafka_username = kafka_config["user"]
            else:
                logger.critical("user setting missing from the "
                                "kafka config section")
                exit(-1)
            if "password" in kafka_config:
                opts.kafka_password = kafka_config["password"]
            else:
                logger.critical("password setting missing from the "
                                "kafka config section")
                exit(-1)
            if "ssl" in kafka_config:
                opts.kafka_ssl = kafka_config.getboolean("ssl")
            if "skip_certificate_verification" in kafka_config:
                kafka_verify = kafka_config.getboolean(
                    "skip_certificate_verification")
                opts.kafka_skip_certificate_verification = kafka_verify
            if "aggregate_topic" in kafka_config:
                opts.kafka_aggregate_topic = kafka_config["aggregate_topic"]
            else:
                logger.critical("aggregate_topic setting missing from the "
                                "kafka config section")
                exit(-1)
            if "forensic_topic" in kafka_config:
                opts.kafka_forensic_topic = kafka_config["forensic_topic"]
            else:
                logger.critical("forensic_topic setting missing from the "
                                "kafka config section")
        if "smtp" in config.sections():
            smtp_config = config["smtp"]
            if "host" in smtp_config:
                opts.smtp_host = smtp_config["host"]
            else:
                logger.critical("host setting missing from the "
                                "smtp config section")
                exit(-1)
            if "port" in smtp_config:
                opts.smtp_port = smtp_config.getint("port")
            if "ssl" in smtp_config:
                opts.smtp_ssl = smtp_config.getboolean("ssl")
            if "skip_certificate_verification" in smtp_config:
                smtp_verify = smtp_config.getboolean(
                    "skip_certificate_verification")
                opts.smtp_skip_certificate_verification = smtp_verify
            if "user" in smtp_config:
                opts.smtp_user = smtp_config["user"]
            else:
                logger.critical("user setting missing from the "
                                "smtp config section")
                exit(-1)
            if "password" in smtp_config:
                opts.smtp_password = smtp_config["password"]
            else:
                logger.critical("password setting missing from the "
                                "smtp config section")
                exit(-1)
            if "from" in smtp_config:
                opts.smtp_from = smtp_config["from"]
            else:
                logger.critical("from setting missing from the "
                                "smtp config section")
            if "to" in smtp_config:
                opts.smtp_to = _str_to_list(smtp_config["to"])
            else:
                logger.critical("to setting missing from the "
                                "smtp config section")
            if "subject" in smtp_config:
                opts.smtp_subject = smtp_config["subject"]
            if "attachment" in smtp_config:
                opts.smtp_attachment = smtp_config["attachment"]
            if "message" in smtp_config:
                opts.smtp_message = smtp_config["message"]

    logging.basicConfig(level=logging.WARNING)
    logger.setLevel(logging.WARNING)

    if opts.debug:
        logging.basicConfig(level=logging.DEBUG)
        logger.setLevel(logging.DEBUG)
    if opts.log_file:
        fh = logging.FileHandler(opts.log_file)
        formatter = logging.Formatter(
            '%(asctime)s - '
            '%(levelname)s - [%(filename)s:%(lineno)d] - %(message)s')
        fh.setFormatter(formatter)
        logger.addHandler(fh)
    if opts.imap_host is None and len(opts.file_path) == 0:
        logger.error("You must supply input files, or an IMAP configuration")
        exit(1)

    if opts.save_aggregate or opts.save_forensic:
        try:
            if opts.elasticsearch_hosts:
                es_aggregate_index = "dmarc_aggregate"
                es_forensic_index = "dmarc_forensic"
                if opts.elasticsearch_index_suffix:
                    suffix = opts.elasticsearch_index_suffix
                    es_aggregate_index = "{0}_{1}".format(
                        es_aggregate_index, suffix)
                    es_forensic_index = "{0}_{1}".format(
                        es_forensic_index, suffix)
                elastic.set_hosts(opts.elasticsearch_hosts,
                                  opts.elasticsearch_ssl,
                                  opts.elasticsearch_ssl_cert_path,
                                  opts.elasticsearch_username,
                                  opts.elasticsearch_password,
                                  timeout=opts.elasticsearch_timeout)
                elastic.migrate_indexes(aggregate_indexes=[es_aggregate_index],
                                        forensic_indexes=[es_forensic_index])
        except elastic.ElasticsearchError as error:
            logger.error("Elasticsearch Error: {0}".format(error.__str__()))
            exit(1)

    if opts.hec:
        if opts.hec_token is None or opts.hec_index is None:
            logger.error("HEC token and HEC index are required when "
                         "using HEC URL")
            exit(1)

        verify = True
        if opts.hec_skip_certificate_verification:
            verify = False
        hec_client = splunk.HECClient(opts.hec, opts.hec_token,
                                      opts.hec_index,
                                      verify=verify)

    kafka_aggregate_topic = opts.kafka_aggregate_topic
    kafka_forensic_topic = opts.kafka_forensic_topic

    file_paths = []
    mbox_paths = []

    for file_path in args.file_path:
        file_paths += glob(file_path)
    for file_path in file_paths:
        if is_mbox(file_path):
            mbox_paths.append(file_path)

    file_paths = list(set(file_paths))
    mbox_paths = list(set(mbox_paths))

    for mbox_path in mbox_paths:
        file_paths.remove(mbox_path)

    counter = Value('i', 0)
    pool = Pool(opts.n_procs, initializer=init, initargs=(counter,))
    results = pool.starmap_async(cli_parse,
                                 zip(file_paths,
                                     repeat(opts.strip_attachment_payloads),
                                     repeat(opts.nameservers),
                                     repeat(opts.dns_timeout),
                                     repeat(opts.offline),
                                     repeat(opts.n_procs >= 1)),
                                 opts.chunk_size)
    pbar = tqdm(total=len(file_paths))
    while not results.ready():
        pbar.update(counter.value - pbar.n)
        time.sleep(0.1)
    pbar.close()
    results = results.get()
    pool.close()
    pool.join()

    for result in results:
        if type(result[0]) is InvalidDMARCReport:
            logger.error("Failed to parse {0} - {1}".format(result[1],
                                                            result[0]))
        else:
            if result[0]["report_type"] == "aggregate":
                aggregate_reports.append(result[0]["report"])
            elif result[0]["report_type"] == "forensic":
                forensic_reports.append(result[0]["report"])

    for mbox_path in mbox_paths:
        reports = get_dmarc_reports_from_mbox(mbox_path, opts.nameservers,
                                              opts.dns_timeout,
                                              opts.strip_attachment_payloads,
                                              opts.offline, False)
        aggregate_reports += reports["aggregate_reports"]
        forensic_reports += reports["forensic_reports"]

    if opts.imap_host:
        try:
            if opts.imap_user is None or opts.imap_password is None:
                logger.error("IMAP user and password must be specified if"
                             "host is specified")

            rf = opts.imap_reports_folder
            af = opts.imap_archive_folder
            ns = opts.nameservers
            sa = opts.strip_attachment_payloads
            ssl = True
            verify = True
            if opts.imap_skip_certificate_verification:
                logger.debug("Skipping IMAP certificate verification")
                verify = False
            if opts.imap_ssl is False:
                ssl = False
            reports = get_dmarc_reports_from_inbox(
                host=opts.imap_host,
                port=opts.imap_port,
                ssl=ssl,
                verify=verify,
                timeout=opts.imap_timeout,
                max_retries=opts.imap_max_retries,
                user=opts.imap_user,
                password=opts.imap_password,
                reports_folder=rf,
                archive_folder=af,
                delete=opts.imap_delete,
                offline=opts.offline,
                nameservers=ns,
                test=opts.imap_test,
                strip_attachment_payloads=sa)

            aggregate_reports += reports["aggregate_reports"]
            forensic_reports += reports["forensic_reports"]

        except Exception as error:
            logger.error("IMAP Error: {0}".format(error.__str__()))
            exit(1)

    results = OrderedDict([("aggregate_reports", aggregate_reports),
                           ("forensic_reports", forensic_reports)])

    if opts.output:
        save_output(results, output_directory=opts.output)

    process_reports(results)

    if opts.smtp_host:
        try:
            verify = True
            if opts.smtp_skip_certificate_verification:
                verify = False
            email_results(results, opts.smtp_host, opts.smtp_from,
                          opts.smtp_to, port=opts.smtp_port, verify=verify,
                          username=opts.smtp_user,
                          password=opts.smtp_password,
                          subject=opts.smtp_subject)
        except Exception as error:
            logger.error("{0}".format(error.__str__()))
            exit(1)

    if opts.imap_host and opts.imap_watch:
        logger.info("Watching for email - Quit with ctrl-c")
        ssl = True
        verify = True
        if opts.imap_skip_certificate_verification:
            logger.debug("Skipping IMAP certificate verification")
            verify = False
        if opts.imap_ssl is False:
            ssl = False
        try:
            sa = opts.strip_attachment_payloads
            watch_inbox(
                opts.imap_host,
                opts.imap_user,
                opts.imap_password,
                process_reports,
                port=opts.imap_port,
                ssl=ssl,
                verify=verify,
                reports_folder=opts.imap_reports_folder,
                archive_folder=opts.imap_archive_folder,
                delete=opts.imap_delete,
                test=opts.imap_test,
                nameservers=opts.nameservers,
                dns_timeout=opts.dns_timeout,
                strip_attachment_payloads=sa)
        except FileExistsError as error:
            logger.error("{0}".format(error.__str__()))
            exit(1)
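A minimal, self-contained sketch of the shared-counter pattern used above, with hypothetical file paths and a trivial parse_one worker standing in for the real parser: each worker increments a multiprocessing.Value installed by the pool initializer, and the parent polls that value to drive the tqdm progress bar while starmap_async runs.

import time
from itertools import repeat
from multiprocessing import Pool, Value

from tqdm import tqdm

counter = None


def init(ctr):
    # Store the shared counter in a module-level global for this worker process.
    global counter
    counter = ctr


def parse_one(path, offline):
    # Stand-in for the real per-file parser; only the counter update matters here.
    with counter.get_lock():
        counter.value += 1
    return path, offline


if __name__ == "__main__":
    paths = ["a.xml", "b.xml", "c.xml"]  # hypothetical inputs
    shared = Value("i", 0)
    pool = Pool(2, initializer=init, initargs=(shared,))
    async_result = pool.starmap_async(parse_one, zip(paths, repeat(True)))
    with tqdm(total=len(paths)) as pbar:
        while not async_result.ready():
            pbar.update(shared.value - pbar.n)
            time.sleep(0.1)
        pbar.update(shared.value - pbar.n)
    results = async_result.get()
    pool.close()
    pool.join()
    print(results)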
Exemple #32
0
    def multipart_upload(self,
                         key_name,
                         source_path,
                         acl=None,
                         metadata={},
                         mimetype=None,
                         headers={},
                         cb=None,
                         num_cb=None):
        try:
            # multipart portions copyright Fabian Topfstedt
            # https://pypi.python.org/pypi/filechunkio/1.5

            import math
            import mimetypes
            from multiprocessing import Pool
            from sinastorage.vendored.filechunkio import FileChunkIO
            multipart_capable = True
            parallel_processes = 4
            min_bytes_per_chunk = 5 * 1024 * 1024  # minimum bytes per part (5 MiB)
            usage_flag_multipart_capable = """ [--multipart]"""
            usage_string_multipart_capable = """
                multipart - Upload files as multiple parts. This needs filechunkio.
                            Requires ListBucket, ListMultipartUploadParts,
                            ListBucketMultipartUploads and PutObject permissions."""
        except ImportError as err:
            multipart_capable = False
            usage_flag_multipart_capable = ""
            usage_string_multipart_capable = '\n\n     "' + \
                str(err)[len('No module named '):] + \
                '" is missing for multipart support '

            raise err
        """
        Parallel multipart upload.
        """
        multipart = self.initiate_multipart_upload(key_name, acl, metadata,
                                                   mimetype, headers)
        source_size = getSize(source_path)
        bytes_per_chunk = max(
            int(math.sqrt(min_bytes_per_chunk) * math.sqrt(source_size)),
            min_bytes_per_chunk)
        chunk_amount = int(math.ceil(source_size / float(bytes_per_chunk)))
        multipart.bytes_per_part = bytes_per_chunk
        multipart.parts_amount = chunk_amount

        pool = Pool(processes=parallel_processes)
        i = 0
        for part in multipart.get_next_part():
            offset = i * bytes_per_chunk
            remaining_bytes = source_size - offset
            chunk_bytes = min([bytes_per_chunk, remaining_bytes])
            pool.apply_async(
                func=_upload_part,
                args=(
                    self.name,
                    key_name,
                    multipart.upload_id,
                    multipart.parts_amount,
                    part,
                    source_path,
                    offset,
                    chunk_bytes,
                    cb,
                    num_cb,
                ),
                callback=lambda part: multipart.parts.append(part))
            #             partResult = _upload_part(bucketName, key_name, multipart.upload_id, multipart.parts_amount, part, source_path, offset, chunk_bytes,
            #                                             cb, num_cb)

            #             multipart.parts.append(partResult)

            i = i + 1

        pool.close()
        pool.join()

        if len(multipart.parts) == chunk_amount:
            self.complete_multipart_upload(multipart)


#             multipart.complete_upload()
#             key = bucket.get_key(keyname)
#             key.set_acl(acl)
        else:
            #             mp.cancel_upload()
            #             print  len(multipart.parts) , chunk_amount
            six.print_(len(multipart.parts), chunk_amount)

            raise RuntimeError("multipart upload is failed!!")
Exemple #33
0
# releaseDirs = ["vc/1/","vc/2/","vc/3/","vc/4/"]


def runTest(gpu):
    run = str(gpu + 1)
    relDir = basePath + run + "Release/"
    if not os.path.isdir(relDir):
        print(relDir)
        return
    # os.system('python3 testNetworksOnFlow.py '+relDir+" "+mType)
    # os.system('CUDA_VISIBLE_DEVICES='+str(gpu)+' python3 testNetworksOnFlow.py '+relDir+" "+mType)
    os.system('CUDA_VISIBLE_DEVICES=' + str(gpu) +
              ' python3 testNetworks.py ' + relDir + " " + mType)


runs = [i for i in range(4)]
p = Pool(4)
res = p.map(runTest, runs)
p.close()
p.join()

# for mType in modelTypes:
# 	for run in range(numRuns):
# 		# relDir = basePath+mType+"/"+str(run+1)+"/"
# 		relDir = basePath+str(run+1)+"Release/"

# 		if not os.path.isdir(relDir):
# 			print(relDir)
# 			continue
# 		os.system('CUDA_VISIBLE_DEVICES='+gpu+' python3 testNetworks.py '+relDir+" "+mType)
# 		# os.system('python3 testNetworks.py '+relDir+" "+mType)
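As a hedged alternative to the os.system call above, the same per-GPU fan-out can be written with subprocess and an explicit environment, which avoids building shell strings by hand. basePath and mType are hypothetical placeholders standing in for the snippet's globals.

import os
import subprocess
from multiprocessing import Pool

basePath = "./vc/"  # hypothetical, mirrors the commented releaseDirs layout
mType = "flow"      # hypothetical model type


def run_test(gpu):
    rel_dir = basePath + str(gpu + 1) + "Release/"
    if not os.path.isdir(rel_dir):
        print(rel_dir)
        return None
    env = dict(os.environ, CUDA_VISIBLE_DEVICES=str(gpu))
    # Pass arguments as a list instead of concatenating a shell command string.
    return subprocess.call(["python3", "testNetworks.py", rel_dir, mType], env=env)


if __name__ == "__main__":
    with Pool(4) as p:
        print(p.map(run_test, range(4)))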
Exemple #34
0
def sample_composite(convResults, num_sources, p_args):
    dataDict = defaultdict(list)

    for i in range(len(convResults)):
        for j in range(len(convResults[i])):
            data_name = convResults[i][j][0]
            if p_args.debug: print "Processing data pack ", data_name
            dataDict[data_name].append(convResults[i][j][1])
            #templist = dataDict.get(data_name)
            # for k,data_test in enumerate(templist):
            #     print "added ", data_test[0], " to ", data_name
    scomp_pool = Pool(processes=cores)
    try:
        if num_sources > 1:

            for w, currentPair in enumerate(dataDict):
                samplelist = dataDict.get(currentPair)
                sorted_results = sorted(samplelist, key=lambda x: (x[0]))

                num_samples = len(samplelist)
                cols = int(min(num_samples, 4))
                rows = int(max(int(math.ceil(num_samples / float(cols))), 1))
                print "Number of rows and columns for the composite plots (samples): ", rows, cols

                dim1 = ""
                dim2 = ""
                headerstr = str(currentPair).split("_vs_")
                if len(headerstr) > 1:
                    dim1 = str(headerstr[0])
                    dim2 = str(headerstr[1])

                print "Processing composite for " + currentPair
                scomp_pool.apply_async(compose_from_fig,
                                       args=[
                                           sorted_results, rows, cols,
                                           num_samples, currentPair, dim1,
                                           dim2, 0, p_args
                                       ])
            scomp_pool.close()
            scomp_pool.join()
        else:
            #procedure for composing all populations of a single sample
            composite_list = []
            sample_name = ""
            dictKeys = dataDict.keys()
            dictKeys.sort(key=natural_sort_key)
            print "sorted keys: ", dictKeys
            for w, key in enumerate(dictKeys):
                samplelist = dataDict.get(key)
                if w == 0: sample_name = samplelist[0][0]
                samplelist[0][0] = key  #replace file name with population name
                composite_list = composite_list + samplelist
            #sorted_results = sorted(composite_list,key=lambda x: (x[0]))
            num_figs = len(composite_list)
            cols = int(min(num_figs, 4))
            rows = int(max(int(math.ceil(num_figs / float(cols))), 1))
            print "Number of rows and columns for the composite plots (samples): ", rows, cols
            compose_from_fig(composite_list, rows, cols, num_figs, sample_name,
                             "", "", 0, p_args)
        plt.close("all")
    except KeyboardInterrupt:
        scomp_pool.terminate()
        scomp_pool.join()
        sys.exit(1)
    except Exception, e:
        scomp_pool.terminate()
        scomp_pool.join()
        print >> sys.stderr, "Exception: %s" % str(e)
        raise Exception("".join(traceback.format_exception(*sys.exc_info())))
Exemple #35
0
def autoconfig_processfile2(name, pool_used, f_index, args):
    try:
        with open(
                name
        ) as result_file:  # No need to specify 'r': this is the default.as

            gates = args.config
            num_gates = len(gates)
            nameParts = name.split("/")

            if len(nameParts) > 1:
                originalName = nameParts[len(nameParts) - 2]
            else:
                if args.name is not None:
                    originalName = args.name + str(f_index)
                else:
                    originalName = "NoNameSample" + str(f_index)
            print "Processing ", (originalName)

            colorlist = args.colorlist

            events = sum(1 for line in
                         result_file) - 1  #quickly determine number of events
            result_file.seek(0)  #rewind to the beginning of file
            header = result_file.readline()
            header = header.strip()
            headers = header.split(
                "\t")  #parse the headers from the first line of input
            headers = filter(None, headers)
            num_markers = len(headers) - 2

            # create a numpy array for faster data access
            if args.debug:
                print "Assigning data to numpy matrix"
                print "header length: ", len(headers)
                print headers
                print "events count: ", events
            fcm = loadNp(result_file, len(headers), events)

            # find the start of pop info on fcs_results_all
            if args.flocklegacy is False:
                pop_offset = 0
                for i, header in enumerate(headers):
                    if header == "pop1":
                        pop_offset = i - 1
                if args.debug: print "Pop offset: ", pop_offset

            axis_popIndexDict = defaultdict(list)

            print "Configuring axises from gate configuration file"
            axises = []
            composite_axis = 0
            last_xmarker = ""
            last_ymarker = ""
            last_parent = 0
            for pop, config in gates.items():
                #pop="pop"+str(i+1)
                #config=gates.get(pop)

                xmarker = str(headers[config[1] - 1])
                ymarker = str(headers[config[2] - 1])
                startx = int((float(config[3]) / 200) * 4096)
                starty = int((float(config[5]) / 200) * 4096)
                endx = int((float(config[4]) / 200) * 4096)
                endy = int((float(config[6]) / 200) * 4096)
                parent = int(config[7])
                key = "axis" + str(composite_axis)
                if (xmarker != last_xmarker) or (ymarker != last_ymarker) or (
                        parent != last_parent):
                    composite_axis = composite_axis + 1
                    key = "axis" + str(composite_axis)
                    axises.append([xmarker, ymarker, key])
                axis_popIndexDict[key].append(pop)
                last_xmarker = xmarker
                last_ymarker = ymarker
                last_parent = parent

            num_axises = len(axises)
            #print "axis_popIndexDict: ", axis_popIndexDict
            #print axises

            cols = int(min(num_axises, 4))
            rows = int(max(int(math.ceil(num_axises / float(cols))), 1))

            print "Iterating through feature pairs"
            sub_results = []
            composite_list = []
            if pool_used == 0: inner_pool = Pool(processes=cores)
            for w, mpair in enumerate(axises):
                lines = []
                dim1 = mpair[0]
                dim2 = mpair[1]
                #print dim1, dim2
                dim1_idx = 0
                dim2_idx = 0
                for i, marker in enumerate(headers):
                    if marker == dim1:
                        dim1_idx = i
                        print("Feature 1: ", marker, i + 1)
                    if marker == dim2:
                        dim2_idx = i
                        print("Feature 2: ", marker, i + 1)

                header_names = dim1 + "_vs_" + dim2
                axis_name = mpair[2]
                print header_names, axis_name

                if args.flocklegacy is False:
                    fcm[:, -1] = 0  #reset color mapping in np matrix
                    pops = axis_popIndexDict.get(axis_name)
                    if args.debug:
                        print "iterate through events to find population members"
                    poplist_colors = []
                    for i, pop in enumerate(pops):
                        config = gates.get(pop)
                        found_pop = config[0]
                        xmarker = str(headers[config[1] - 1])
                        ymarker = str(headers[config[2] - 1])
                        startx = int((float(config[3]) / 200) * 4096)
                        starty = int((float(config[5]) / 200) * 4096)
                        endx = int((float(config[4]) / 200) * 4096)
                        endy = int((float(config[6]) / 200) * 4096)
                        if i == 0: parent_gate = int(config[7])
                        cluster_type = int(config[8])
                        pop_loc = found_pop + pop_offset
                        parent_poploc = parent_gate + pop_offset

                        if (xmarker == dim1) and (ymarker == dim2):
                            if cluster_type == 2:
                                print "slanted"
                            else:
                                lines.append([(startx, endx), (starty, starty),
                                              colorlist[i + 2]])
                                lines.append([(startx, startx), (starty, endy),
                                              colorlist[i + 2]])
                                lines.append([(startx, endx), (endy, endy),
                                              colorlist[i + 2]])
                                lines.append([(endx, endx), (starty, endy),
                                              colorlist[i + 2]])
                        elif (xmarker == dim2) and (ymarker == dim1):
                            if cluster_type == 2:
                                print "slanted"
                            else:
                                lines.append([(starty, endy), (startx, startx),
                                              colorlist[i + 2]])
                                lines.append([(starty, starty), (startx, endx),
                                              colorlist[i + 2]])
                                lines.append([(starty, endy), (endx, endx),
                                              colorlist[i + 2]])
                                lines.append([(endy, endy), (startx, endx),
                                              colorlist[i + 2]])

                        if args.showparent and (config[0] > 1) and i == 0:
                            fcm[:, -1] = (1 - fcm[:, pop_loc]) + (
                                1 - fcm[:, parent_poploc])
                        else:
                            fcm[:, -1] = np.maximum(
                                fcm[:, -1], (1 - fcm[:, pop_loc]) * (i + 2))

                if args.debug: print "sorting numpy array"
                if args.sort:
                    sfcm = fcm[np.argsort(
                        fcm[:, -1]
                    )]  #sort the data set based on population number
                else:
                    sfcm = fcm

                if args.reversesort:
                    sfcm = sfcm[::-1]

                print "creating color array"
                cdata = []
                for a in sfcm[:, -1]:
                    try:
                        cdata.append(colorlist[a])
                    except Exception, err:
                        print "ERROR in index: ", a
                        sys.stderr.write('Error: %s\n' % str(err))
                        return 1

                xdata = sfcm[:, dim1_idx]
                ydata = sfcm[:, dim2_idx]
                sample_name = originalName

                poplist = []
                if args.flocklegacy is False:
                    parent_pop_name = "pop" + str(parent_gate)
                    if args.showparent:
                        poplist.append(parent_pop_name)
                    else:
                        poplist.append("")
                    poplist = poplist + pops

                    pop_name = ""
                    if args.showparent:
                        for i, pop in enumerate(poplist):
                            if i > 0: pop_name = pop_name + pop
                    else:
                        for i, pop in enumerate(poplist):
                            pop_name = pop_name + pop
                else:
                    pop_name = "FLOCK"

                png_file = sample_name + "_" + header_names + "_" + pop_name + ".png"
                if pool_used == 0:
                    inner_pool.apply_async(plotfig,
                                           args=[
                                               sample_name, xdata, ydata,
                                               cdata, dim1, dim2, poplist,
                                               lines, args
                                           ])
                else:
                    png_file = plotfig(sample_name, xdata, ydata, cdata, dim1,
                                       dim2, poplist, lines, args)
                sub_results.append([axis_name, [sample_name, png_file]])
                composite_list.append([axis_name, png_file])
            if pool_used == 0:
                inner_pool.close()
                inner_pool.join()
            if args.gatescomposite:
                print "Number of rows and columns for the composite plots (gates): ", rows, cols
                compose_from_fig(composite_list, rows, cols, num_axises,
                                 sample_name, " ", " ", 0, args)

            return sub_results
    except IOError as exc:
        if exc.errno != errno.EISDIR:  # Do not fail if a directory is found, just ignore it.
            raise  # Propagate other kinds of IOError.args
    except Exception, e:
        print >> sys.stderr, "Exception: %s" % str(e)
        raise Exception("".join(traceback.format_exception(*sys.exc_info())))
Exemple #36
0
def autoconfig_processfile(name, pool_used, f_index, args):
    try:
        with open(
                name
        ) as result_file:  # No need to specify 'r': this is the default.as

            gates = args.config
            if (gates is not None):
                num_gates = len(gates)
            else:
                num_gates = 0

            nameParts = name.split("/")

            if len(nameParts) > 1:
                originalName = nameParts[len(nameParts) - 2]
            else:
                if args.name is not None:
                    originalName = args.name + str(f_index)
                else:
                    originalName = "NoNameSample" + str(f_index)
            print "Processing ", (originalName)

            colorlist = args.colorlist

            events = sum(1 for line in
                         result_file) - 1  #quickly determine number of events
            result_file.seek(0)  #rewind to the beginning of file
            header = result_file.readline()
            header = header.strip()
            headers = header.split(
                "\t")  #parse the headers from the first line of input
            num_markers = len(headers) - 2

            # create a numpy array for faster data access
            if args.debug: print "Assigning data to numpy matrix"
            fcm = loadNp(result_file, len(headers), events)

            # find the start of pop info on fcs_results_all
            if args.flocklegacy is False:
                pop_offset = 0
                for i, header in enumerate(headers):
                    if header == "pop1":
                        pop_offset = i - 1
                if args.debug: print "Pop offset: ", pop_offset

            # parsing gates from configuration data
            if args.debug: print "Configuring gates from file"
            sub_results = []
            if pool_used == 0: inner_pool = Pool(processes=cores)
            for gate, config in gates.iteritems():
                xmarker = str(headers[config[1] - 1])
                ymarker = str(headers[config[2] - 1])
                startx = int((float(config[3]) / 200) * 4096)
                starty = int((float(config[5]) / 200) * 4096)
                endx = int((float(config[4]) / 200) * 4096)
                endy = int((float(config[6]) / 200) * 4096)
                parent_gate = int(config[7])

                lines = []
                cluster_type = int(config[8])
                if cluster_type == 2:
                    print "slanted"
                else:
                    lines.append([(startx, endx), (starty, starty),
                                  colorlist[1]])
                    lines.append([(startx, startx), (starty, endy),
                                  colorlist[1]])
                    lines.append([(startx, endx), (endy, endy), colorlist[1]])
                    lines.append([(endx, endx), (starty, endy), colorlist[1]])

                key = xmarker + "_vs_" + ymarker

                dim1 = xmarker
                dim2 = ymarker
                print dim1, dim2
                dim1_idx = 0
                dim2_idx = 0

                for i, marker in enumerate(headers):
                    if marker == dim1:
                        dim1_idx = i
                        print("Feature 1: ", marker, i + 1)
                    if marker == dim2:
                        dim2_idx = i
                        print("Feature 2: ", marker, i + 1)

                header_names = key
                print header_names

                if args.flocklegacy is False:
                    poploc = int(config[0]) + pop_offset
                    parent_poploc = parent_gate + pop_offset
                    print "gate location: ", poploc
                    print "parent gate location: ", parent_poploc
                    if args.debug:
                        print "iterate through events to find population members"
                    fcm[:, -1] = 0
                    if args.showparent and (config[0] > 1):
                        fcm[:, -1] = (1 - fcm[:, poploc]) + (
                            1 - fcm[:, parent_poploc])
                    else:
                        fcm[:, -1] = (2 - 2 * fcm[:, poploc])

                if args.debug: print "sorting numpy array"
                if args.sort:
                    sfcm = fcm[np.argsort(
                        fcm[:, -1]
                    )]  #sort the data set based on population number
                else:
                    sfcm = fcm

                if args.reversesort:
                    sfcm = sfcm[::-1]

                #print sfcm[0, :]
                if args.debug: print "creating color array"
                cdata = []
                for a in sfcm[:, -1]:
                    cdata.append(colorlist[a])

                xdata = sfcm[:, dim1_idx]
                ydata = sfcm[:, dim2_idx]
                sample_name = originalName

                poplist = []
                if args.flocklegacy is False:
                    pop_name = gate
                    parent_pop_name = "pop" + str(parent_gate)
                    if args.showparent: poplist.append(parent_pop_name)
                    poplist.append(pop_name)
                else:
                    pop_name = "FLOCK"

                png_file = sample_name + "_" + header_names + "_" + pop_name + ".png"

                if pool_used == 0:
                    inner_pool.apply_async(plotfig,
                                           args=[
                                               sample_name, xdata, ydata,
                                               cdata, xmarker, ymarker,
                                               poplist, lines, args
                                           ])
                else:
                    plotfig(sample_name, xdata, ydata, cdata, xmarker, ymarker,
                            poplist, lines, args)
                if args.flocklegacy is False:
                    sub_results.append([pop_name, [sample_name, png_file]])
                else:
                    sub_results.append(
                        [pop_name + str(gate), [sample_name, png_file]])
            if pool_used == 0:
                inner_pool.close()
                inner_pool.join()
            return sub_results
    except IOError as exc:
        if exc.errno != errno.EISDIR:  # Do not fail if a directory is found, just ignore it.
            raise  # Propagate other kinds of IOError.args
    except:
        #print >> sys.stderr, "Exception: %s" % str(e)
        raise Exception("".join(traceback.format_exception(*sys.exc_info())))
Exemple #37
0
                        and (p_args.flocklegacy is False)):
                    if p_args.showmultigates:
                        proc = file_pool.apply_async(
                            autoconfig_processfile2,
                            args=[name, min(24, num_files), i, p_args])
                    else:
                        proc = file_pool.apply_async(
                            autoconfig_processfile,
                            args=[name, min(24, num_files), i, p_args])
                else:
                    proc = file_pool.apply_async(
                        processfile,
                        args=[name, min(24, num_files), i, p_args])
                results.append(proc)
            file_pool.close()
            file_pool.join()
        except KeyboardInterrupt:
            file_pool.terminate()
            file_pool.join()
            sys.exit(1)
        except Exception, e:
            file_pool.terminate()
            file_pool.join()
            print >> sys.stderr, "Exception: %s" % str(e)
            raise Exception("".join(
                traceback.format_exception(*sys.exc_info())))

        plt.close("all")

        #generating composite at samples level
        if p_args.samplescomposite:
Exemple #38
0
# datas = re.findall('<dl class="row2" id="tuijian">([a-zA-Z<>]*)</dl>',response.data)
# html = html.fromstring(response.data)
# print(html)
element = etree.HTML(response.data)
eles = element.xpath(
    '//a[@class="addcss_a"]/img/@src|//a[@class="addcss_a"]/img/@title')


# print(eles)
# Define a function that downloads one image to disk
def download(url, file_name, header):
    response = http.request('get', url, headers=header)
    os.mkdir('img/{0}'.format(file_name), 0o755)
    img_name = url.split("-")[-1]
    with open('img/{0}/{1}'.format(file_name, img_name), "wb+") as f:
        f.write(response.data)
    print("downloading")
    time.sleep(1)


if __name__ == "__main__":
    # eles = get_img_data(http)

    pool = Pool(5)
    for i in range(len(eles)):
        if i % 2 == 0:
            pool.apply_async(download, args=(eles[i], eles[i + 1], header))
    pool.close()
    pool.join()
    print("下载任务结束")
Exemple #39
0
def extract_features(timeseries_container, feature_extraction_settings=None,
                     column_id=None, column_sort=None, column_kind=None, column_value=None):
    """
    Extract features from

    * a :class:`pandas.DataFrame` containing the different time series

    or

    * a dictionary of :class:`pandas.DataFrame` each containing one type of time series

    In both cases a :class:`pandas.DataFrame` with the calculated features will be returned.

    For a list of all the calculated time series features, please see the
    :class:`~tsfresh.feature_extraction.settings.FeatureExtractionSettings` class,
    which is used to control which features with which parameters are calculated.

    For a detailed explanation of the different parameters and data formats please see :ref:`data-formats-label`.

    Examples
    ========

    >>> from tsfresh.examples import load_robot_execution_failures
    >>> from tsfresh import extract_features
    >>> df, _ = load_robot_execution_failures()
    >>> X = extract_features(df, column_id='id', column_sort='time')

    which would give the same results as described above. In this case, the column_kind is not allowed.
    Except that, the same rules for leaving out the columns apply as above.

    :param timeseries_container: The pandas.DataFrame with the time series to compute the features for, or a
            dictionary of pandas.DataFrames.
    :type timeseries_container: pandas.DataFrame or dict

    :param column_id: The name of the id column to group by.
    :type column_id: str
    :param column_sort: The name of the sort column.
    :type column_sort: str
    :param column_kind: The name of the column keeping record on the kind of the value.
    :type column_kind: str
    :param column_value: The name for the column keeping the value itself.
    :type column_value: str

    :param feature_extraction_settings: settings object that controls which features are calculated
    :type feature_extraction_settings: tsfresh.feature_extraction.settings.FeatureExtractionSettings

    :return: The (maybe imputed) DataFrame with the extracted features.
    :rtype: pandas.DataFrame
    """

    # Always use the standardized way of storing the data.
    # See the function normalize_input_to_internal_representation for more information.
    kind_to_df_map, column_id, column_value = \
        dataframe_functions.normalize_input_to_internal_representation(timeseries_container, column_id, column_sort,
                                                                       column_kind, column_value)

    # Use the standard setting if the user did not supply ones himself.
    if feature_extraction_settings is None:
        feature_extraction_settings = FeatureExtractionSettings()
        for key in kind_to_df_map:
            feature_extraction_settings.set_default_parameters(key)

    # If requested, do profiling (advanced feature)
    if feature_extraction_settings.PROFILING:
        profiler = profiling.start_profiling()

    # Extract the time series features for every type of time series and concatenate them together.
    all_possible_unique_id_values = set(id_value for kind, df in kind_to_df_map.items()
                                        for id_value in df[column_id])
    df_with_ids = pd.DataFrame(index=all_possible_unique_id_values)

    pool = Pool(feature_extraction_settings.n_processes)
    partial_extract_features_for_one_time_series = partial(
        _extract_features_for_one_time_series, column_id=column_id,
        column_value=column_value, settings=feature_extraction_settings)
    extracted_features = pool.map(partial_extract_features_for_one_time_series, kind_to_df_map.items())

    # Add time series features to result
    result = pd.concat([df_with_ids] + extracted_features, axis=1, join='outer', join_axes=[df_with_ids.index])\
        .astype(np.float64)

    # Impute the result if requested
    if feature_extraction_settings.IMPUTE is not None:
        feature_extraction_settings.IMPUTE(result)

    # Turn off profiling if it was turned on
    if feature_extraction_settings.PROFILING:
        profiling.end_profiling(profiler, filename=feature_extraction_settings.PROFILING_FILENAME,
                                sorting=feature_extraction_settings.PROFILING_SORTING)

    pool.close()
    pool.join()

    return result
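A generic sketch (toy data, hypothetical process_one worker) of the partial-plus-Pool.map pattern used here: the shared keyword arguments are fixed once with functools.partial, and only the (kind, data) pairs are shipped to the worker processes.

from functools import partial
from multiprocessing import Pool


def process_one(item, scale=1.0):
    # Stand-in for _extract_features_for_one_time_series: receives one
    # (kind, values) pair plus the keyword arguments fixed by partial().
    kind, values = item
    return kind, [v * scale for v in values]


if __name__ == "__main__":
    kind_to_values = {"a": [1, 2, 3], "b": [4, 5]}
    worker = partial(process_one, scale=2.0)
    with Pool(2) as pool:
        results = pool.map(worker, kind_to_values.items())
    print(dict(results))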
Exemple #40
0
class ceph:
    def __init__(self, filename=None, keyname=None, public=True):
        self.__connect()
        self.bucket = self.__get_bucket()
        self.uploads = collections.OrderedDict()
        self.pool = Pool(processes=24)

        if filename is not None and keyname is not None:
            self.upload(filename, keyname, public=public)
            self.sync()
        pass

    def __del__(self):
        try:
            self.pool.close()
            self.pool.join()
        except:
            pass

    def __connect(self):
        self.conn = boto.connect_s3(
            aws_access_key_id=config.key,
            aws_secret_access_key=config.secret,
            host=config.ceph_fqdn,
            is_secure=True,
            calling_format=boto.s3.connection.OrdinaryCallingFormat(),
        )

    def __create_bucket(self):
        self.bucket = self.conn.create_bucket(config.bucket)
        from boto.s3.cors import CORSConfiguration
        cors_cfg = CORSConfiguration()
        cors_cfg.add_rule([
            'GET',
        ], '*', allowed_header='*')
        cors_cfg.add_rule('GET', '*')
        self.bucket.set_cors(cors_cfg)

    def __get_bucket(self):
        try:
            self.bucket = self.conn.get_bucket(config.bucket)
        except boto.exception.S3ResponseError:
            self.__create_bucket()
        return self.bucket

    def delete(self, key):
        self.bucket.delete_key(key)

    def set_public(self, key):
        log.log('setting key %s as public-read' % (key, ))
        key = self.bucket.get_key(key)
        key.set_canned_acl('public-read')

    def init_mp(self, keyname):
        self.bucket.delete_key(keyname)
        return self.bucket.initiate_multipart_upload(keyname)

    def upload(self, filename, keyname, callback=None, args=None, public=True):
        log.log('uploading %s as %s' % (filename, keyname))
        self.sync(True)

        source_size = os.stat(filename).st_size
        chunk_size = 41943040  #same as rgw strip size
        chunk_count = int(math.ceil(source_size / float(chunk_size)))

        if chunk_count > 1:
            mp = self.init_mp(keyname)
            offlist = []

            for i in range(chunk_count):
                offset = chunk_size * i
                bytes = min(chunk_size, source_size - offset)
                offlist.append({
                    'offset': offset,
                    'bytes': bytes,
                    'i': i,
                    'path': filename,
                    'mp': mp
                })

            map_async = self.pool.map_async(upload_part, offlist)
        else:
            mp = None
            self.bucket.delete_key(keyname)
            key = self.bucket.new_key(keyname)

            data = {'path': filename, 'key': key}
            map_async = self.pool.map_async(upload_file, [data])

        try:
            map_async.get(0.001)
        except multiprocessing.context.TimeoutError:
            pass
        self.uploads[keyname] = {
            'mp': mp,
            'callback': callback,
            'args': args,
            'map_async': map_async,
            'completed': False,
            'public': public,
        }

    def end_upload(self, keyname):
        item = self.uploads[keyname]
        if item['mp'] is not None:
            log.log('ending mp: %s' % (keyname))
            item['mp'].complete_upload()

        if item['public'] is True:
            self.set_public(keyname)
        if item['callback'] is not None:
            item['callback'](item['args'])

    #you MUST NOT use this object after sync(lazy=False)
    def sync(self, lazy=False):
        if lazy is True:
            for keyname in self.uploads:
                item = self.uploads[keyname]
                if item['completed'] is True:
                    continue
                try:
                    self.uploads[keyname]['map_async'].get(0.001)
                    self.uploads[keyname]['completed'] = True
                except multiprocessing.context.TimeoutError:
                    continue
                self.end_upload(keyname)
            return
        self.pool.close()
        for keyname in self.uploads:
            item = self.uploads[keyname]
            if item['completed'] is True:
                continue
            self.uploads[keyname]['map_async'].get()
            self.end_upload(keyname)
        self.pool.join()
        'rct-', 'sdmu-', 'shkd-', 'snis-', 'sprd-', 'vagu-', 'vec-', 'vrtm-',
        'wanz-', 'cheat', 'next', 'cuckold', 'son', 'teacher', 'neighbor',
        'friend', 'incest', 'game', 'pantyhose', 'father', 'dad', 'boss',
        'fera-', 'dvdes-', 'dasd', 'rape', 'cesd', 'dandy', 'iqqq', 'mrss',
        'mdyd', 'brother', 'husband', 'silent', 'bride', 'kiss', 'daughter',
        'immoral', 'slave', 'subordinate', 'widow', 'married', 'nurse'
    ]

    # finished=[]
    # lock=Lock()
    serlist = set(serlist)
    serlist = list(serlist)
    sstemp = ['shkd-']
    # finished=[]
    # lock=Lock()
    try:
        with open('log.txt', 'r') as f:
            finished = f.read()
        finished = finished.split('\n')
    except FileNotFoundError:
        finished = []
    pools = Pool(4)
    for search in serlist:
        if search in finished:
            print('%s has finished' % search)
            continue

        pools.apply_async(main, (search, ))
    pools.close()
    pools.join()
Exemple #42
0
class Collector(object):
    def __init__(self, hdfs_app_path, kafka_topic, conf_type):

        self._initialize_members(hdfs_app_path, kafka_topic, conf_type)

    def _initialize_members(self, hdfs_app_path, kafka_topic, conf_type):

        # getting parameters.
        self._logger = logging.getLogger('SPOT.INGEST.FLOW')
        self._hdfs_app_path = hdfs_app_path
        self._kafka_topic = kafka_topic

        # get script path
        self._script_path = os.path.dirname(os.path.abspath(__file__))

        # read flow configuration.
        conf_file = "{0}/ingest_conf.json".format(
            os.path.dirname(os.path.dirname(self._script_path)))
        conf = json.loads(open(conf_file).read())
        self._conf = conf["pipelines"][conf_type]

        # set configuration.
        self._collector_path = self._conf['collector_path']
        self._dsource = 'flow'
        self._hdfs_root_path = "{0}/{1}".format(hdfs_app_path, self._dsource)

        self._supported_files = self._conf['supported_files']

        # create collector watcher
        self._watcher = FileWatcher(self._collector_path,
                                    self._supported_files)

        # Multiprocessing.
        self._processes = conf["collector_processes"]
        self._ingestion_interval = conf["ingestion_interval"]
        self._pool = Pool(processes=self._processes)

    def start(self):

        self._logger.info("Starting FLOW ingest")
        self._watcher.start()

        try:
            while True:
                self._ingest_files_pool()
                time.sleep(self._ingestion_interval)
        except KeyboardInterrupt:
            self._logger.info("Stopping FLOW collector...")
            Util.remove_kafka_topic(self._kafka_topic.Zookeeper,
                                    self._kafka_topic.Topic, self._logger)
            self._watcher.stop()
            self._pool.terminate()
            self._pool.close()
            self._pool.join()
            SystemExit("Ingest finished...")

    def _ingest_files_pool(self):

        if self._watcher.HasFiles:

            for x in range(0, self._processes):
                file = self._watcher.GetNextFile()
                result = self._pool.apply_async(
                    ingest_file,
                    args=(
                        file,
                        self._kafka_topic.Partition,
                        self._hdfs_root_path,
                        self._kafka_topic.Topic,
                        self._kafka_topic.BootstrapServers,
                    ))
                # result.get()  # to debug, add try/except and call get() here.
                if not self._watcher.HasFiles: break
        return True
Exemple #43
0
    def make_binary_dataset(vocab, input_prefix, output_prefix, lang,
                            num_workers):
        logger.info("[{}] Dictionary: {} types".format(lang, len(vocab)))
        n_seq_tok = [0, 0]
        replaced = Counter()

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        input_file = "{}{}".format(input_prefix,
                                   ("." + lang) if lang is not None else "")
        offsets = find_offsets(input_file, num_workers)
        (first_chunk, *more_chunks) = zip(offsets, offsets[1:])
        pool = None
        if num_workers > 1:
            pool = Pool(processes=num_workers - 1)
            for worker_id, (start_offset, end_offset) in enumerate(more_chunks,
                                                                   start=1):
                prefix = "{}{}".format(output_prefix, worker_id)
                pool.apply_async(
                    binarize,
                    (
                        args,
                        input_file,
                        vocab,
                        prefix,
                        lang,
                        start_offset,
                        end_offset,
                    ),
                    callback=merge_result,
                )
            pool.close()

        ds = indexed_dataset.make_builder(
            dataset_dest_file(args, output_prefix, lang, "bin"),
            impl=args.dataset_impl,
            vocab_size=len(vocab),
        )
        merge_result(
            Binarizer.binarize(
                input_file,
                vocab,
                lambda t: ds.add_item(t),
                offset=first_chunk[0],
                end=first_chunk[1],
            ))
        if num_workers > 1:
            pool.join()
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                temp_file_path = dataset_dest_prefix(args, prefix, lang)
                ds.merge_file_(temp_file_path)
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))

        ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))

        logger.info(
            "[{}] {}: {} sents, {} tokens, {:.3}% replaced by {}".format(
                lang,
                input_file,
                n_seq_tok[0],
                n_seq_tok[1],
                100 * sum(replaced.values()) / n_seq_tok[1],
                vocab.unk_word,
            ))
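A minimal sketch of the apply_async-with-callback bookkeeping used above, with a dummy binarize_chunk worker: callbacks run in the parent process, so updating the shared Counter and running totals there needs no extra synchronization.

from collections import Counter
from multiprocessing import Pool

replaced = Counter()
n_seq_tok = [0, 0]


def merge_result(worker_result):
    # Runs in the parent process for every finished task.
    replaced.update(worker_result["replaced"])
    n_seq_tok[0] += worker_result["nseq"]
    n_seq_tok[1] += worker_result["ntok"]


def binarize_chunk(start, end):
    # Dummy stand-in for the real binarize() worker.
    return {"replaced": Counter({"<unk>": 1}), "nseq": 1, "ntok": end - start}


if __name__ == "__main__":
    offsets = [0, 100, 200, 300]
    pool = Pool(processes=3)
    for start, end in zip(offsets, offsets[1:]):
        pool.apply_async(binarize_chunk, (start, end), callback=merge_result)
    pool.close()
    pool.join()
    print(n_seq_tok, dict(replaced))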
def para_segment_compress_eigerdata(images,
                                    mask,
                                    md,
                                    filename,
                                    num_sub=100,
                                    bad_pixel_threshold=1e15,
                                    hot_pixel_threshold=2**30,
                                    bad_pixel_low_threshold=0,
                                    nobytes=4,
                                    bins=1,
                                    dtypes='images',
                                    reverse=True,
                                    num_max_para_process=50):
    '''
    Compress Eiger data in parallel, without a header; this function drives the parallel compression.
    '''

    if dtypes == 'uid':
        uid = md['uid']  #images
        detector = get_detector(db[uid])
        images_ = load_data(uid, detector, reverse=reverse)
        N = len(images_)
    else:
        N = len(images)
    #N = int( np.ceil( N/ bins  ) )
    num_sub *= bins
    if N % num_sub:
        Nf = N // num_sub + 1
        print(
            'The average image intensity will be slightly incorrect, with about 1% error.'
        )
        print(
            'Please choose num_sub so that Num_images/num_sub has remainder 0 to get a correct avg_image.'
        )
    else:
        Nf = N // num_sub
    print('It will create %i temporary files for parallel compression.' % Nf)

    if Nf > num_max_para_process:
        N_runs = np.int(np.ceil(Nf / float(num_max_para_process)))
        print(
            'The parallel run number: %s is larger than num_max_para_process: %s'
            % (Nf, num_max_para_process))
    else:
        N_runs = 1
    result = {}
    #print( mask_filename )# + '*'* 10 + 'here' )
    for nr in range(N_runs):
        if (nr + 1) * num_max_para_process > Nf:
            inputs = range(num_max_para_process * nr, Nf)
        else:
            inputs = range(num_max_para_process * nr,
                           num_max_para_process * (nr + 1))
        fns = [filename + '_temp-%i.tmp' % i for i in inputs]
        #print( nr, inputs, )
        pool = Pool(processes=len(inputs))  #, maxtasksperchild=1000 )
        #print( inputs )
        for i in inputs:
            if i * num_sub <= N:
                result[i] = pool.apply_async(segment_compress_eigerdata, [
                    images, mask, md, filename + '_temp-%i.tmp' % i,
                    bad_pixel_threshold, hot_pixel_threshold,
                    bad_pixel_low_threshold, nobytes, bins, i * num_sub,
                    (i + 1) * num_sub, dtypes, reverse
                ])

        pool.close()
        pool.join()
        pool.terminate()
    return result
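A condensed sketch of the batching scheme above, using a hypothetical run_in_batches helper and abs() as a stand-in worker: the jobs are split into rounds of at most num_max_para_process, and each round gets its own short-lived Pool.

from multiprocessing import Pool


def run_in_batches(jobs, num_max_para_process, worker):
    results = {}
    for start in range(0, len(jobs), num_max_para_process):
        batch = jobs[start:start + num_max_para_process]
        pool = Pool(processes=len(batch))
        async_results = {job: pool.apply_async(worker, (job,)) for job in batch}
        pool.close()
        pool.join()
        results.update({job: r.get() for job, r in async_results.items()})
    return results


if __name__ == "__main__":
    print(run_in_batches(list(range(-5, 5)), num_max_para_process=4, worker=abs))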
def stepGuidedGradient(data, problem=None, purpose=None):
    currCoef = dualSolve(data, problem=problem, purpose="init")
    originalCoef = deepcopy(currCoef)

    iteration = 0
    epsilon = 0  # TODO: declare convergence gap
    updatedData = deepcopy(data['incidentData'])

    resultName = ''  # TODO: declare the file name for storing results
    maxIter = 20  # TODO: set maximum number of iterations for the gradient algorithm

    # while likelihood does not suffer
    with open(resultName, 'w+') as f:
        while iteration < maxIter:
            f.write("Iteration : {}\n".format(iteration))
            inputs = []
            for tempData in updatedData:
                # set scale to true
                inputs.append(
                    [tempData, currCoef, data['df'], data['neighborGraph']])

            coreCount = multiprocessing.cpu_count()
            pool = Pool(coreCount - 2)  # leave two cores
            results = pool.map(getShiftsSplit, inputs)
            pool.close()
            pool.join()

            # aggregate results
            updatedDF = deepcopy(data['df'])
            # TODO: mark each shift in the updated df set using 'results'
            # TODO: create training data based on shifts
            updatedCountData = None
            '''Take Gradient Step'''
            # create vector of times (x), features (w). Can also be numpy arrays
            times = []
            features = []
            oldCoef = deepcopy(currCoef)

            if problem == "poisson":
                currCoef = doGradientStepPoissonVector(times, features,
                                                       currCoef)
                updatedCountData = None  # TODO: create training data based on shifts
                newLikelihood = getTotalLikelihoodPoisson(
                    updatedCountData, currCoef)
                likelihoodPriorDefender = getTotalLikelihoodPoisson(
                    updatedCountData, oldCoef)

            elif problem == "logistic":
                currCoef = doGradientStepLogisticVectorized(
                    times, features, currCoef)
                updatedCountData = None  # TODO: create training data based on shifts
                newLikelihood = getTotalLikelihoodLogistic(
                    updatedCountData, currCoef)
                likelihoodPriorDefender = getTotalLikelihoodLogistic(
                    updatedCountData, oldCoef)

            f.write("Likelihood at iteration {} before gradient step is {}\n".
                    format(iteration, likelihoodPriorDefender))
            f.write("Likelihood at iteration {} after graident step is {}\n".
                    format(iteration, newLikelihood))

            gap = likelihoodPriorDefender - newLikelihood

            if newLikelihood > likelihoodPriorDefender and abs(gap) < epsilon:
                f.flush()
                break

            iteration += 1

    return currCoef, originalCoef
def createJHMDBParallel(db_settings, logger):
    frame_format = db_settings['frame_format']
    action_name = db_settings['action_name']
    video_name = db_settings['video_name']
    annotation_path = db_settings['annotation_path']
    segmented_path = db_settings['segmented_path']
    orig_path = db_settings['orig_path']
    level = int(db_settings['level'])
    frame = db_settings['frame']
    n_neg = db_settings['number_of_negatives']
    pickle_path = db_settings['pickle_path']
    neighbor_num = db_settings['number_of_neighbors'] #TODO add this to db_settings in experimentSetup
    database_path = db_settings['database_path']
    database_list_path = db_settings['database_list_path']
    features_path = db_settings['features_path']
    feature_type = db_settings['feature_type']
    if type(feature_type) is str:
        feature_type = getattr(FeatureType, feature_type)
    print feature_type
    if (type(feature_type) is list) and type(feature_type[0]) is str:
        feature_type = map(lambda x: getattr(FeatureType, x), feature_type)
    labelledlevelvideo_path = db_settings['voxellabelledlevelvideo_path']
    optical_flow_path = db_settings['optical_flow_path']
    output_path = db_settings['output_path']
    print 'output_path is:',output_path
    compute_segment = db_settings['compute_segment']
    #TODO: maybe we should save them segarately
    #TODO: write a merge segment function?
    logger.log('*** Segment parsing ***')
    keys = ['target', 'negative'] + [ 'neighbor{0}'.format(i) for i in range(neighbor_num)]
    fcn_path = db_settings['fcn_path']
    for action in action_name:
        for video in video_name[action]:
            logger.log('Processing action:`{action}`, video:`{video}`:'.format(action=action, video=video))
            try:
                annotator = JA(annotation_path.format(action_name=action, video_name=video))
            except:
                annotator = None
            segmentor_list = []
            if compute_segment:
                for i in xrange(frame):
                    print 'segment of frame:', i
                    segmentor = MySegmentation(orig_path.format(action_name=action, video_name=video, level=level)+frame_format,
                                    segmented_path.format(action_name=action, video_name=video, level=level)+frame_format,
                                    '',# features_path.format(action_name=action, video_name=video, level=level),
                                    annotator,
                                    None,
                                    labelledlevelvideo_path.format(action_name=action, video_name=video, level=level),
                                    optical_flow_path.format(action_name=action, video_name=video, level=level)+frame_format,
                                    negative_neighbors=n_neg,
                                    fcn_path=fcn_path.format(action_name=action, video_name=video, level=level)+frame_format,
                                    output_path=output_path)
                    segmentor.setFeatureType(feature_type)
                    segmentor_list.append((i, segmentor))
                    # segmentor_list.append((i, MySegmentation(orig_path.format(d)+frame_format, seg_path.format(d,level)+frame_format, annotator)))
                # parallelProcess = lambda pair: pair[1].processNewFrame(pair[0]) #pair = (frame_number, segment)
                print 'frame number:', frame
                s = time.time()
                print 'parallelizing begins', 'Elapsed time:', time.time()-s
                parallel = True
                if parallel:
                    from multiprocessing import Pool
                    print 'create pool'
                    #this is the real one
                    pool = Pool(8)
                    print 'defining function'
                    parallelized_segmentor_list = pool.map(parallelProcess, segmentor_list)
                    pool.close()
                    pool.join()
                else:
                    parallelized_segmentor_list = []
                    for segment_frame_chunk_small_bad in segmentor_list:
                        parallelized_segmentor_list.append(parallelProcess(segment_frame_chunk_small_bad))
                del segmentor_list
                # for segg in parallelized_segmentor_list:
                    # segg.processNewFramePar(i+1)
                print 'Parallelizing DONE.', 'Elapsed time: ', time.time()-s

                print 'Merging begins'
                s = time.time()
                segmentor = parallelized_segmentor_list[-1]
                print len(parallelized_segmentor_list)
                for i in xrange(len(parallelized_segmentor_list)-1):
                    try:
                        segmentor.merge(parallelized_segmentor_list[i])
                    except:
                        print 'Bad Segment', i
                        raise
                    parallelized_segmentor_list[i] = 0
                # segmentor.current_frame = len(parallelized_segmentor_list)
                print 'Merging DONE. Elapsed time: ', time.time()-s
                # for i in xrange(frame):
                    # logger.log('frame {0}'.format(i+1))
                    # segmentor.processNewFrame()
                segmentor.doneProcessing()
                getFeatFrom = lambda sv, feat_name: getattr(sv, 'get%s' % feat_name)()
                logger.log("Total number of supervoxels: {0}".format(len(segmentor.supervoxels)))
                data = {}
                for feat_type in feature_type:
                    feature_name = feat_type.name
                    feature_len = getFeatFrom(segmentor.supervoxels_list[0], feature_name).shape[1]
                    features = np.zeros((len(segmentor.supervoxels_list), feature_len))
                    for i,sv in enumerate(segmentor.supervoxels_list):
                        features[i] = getFeatFrom(sv, feature_name)
                        if feature_name == 'FCN':
                            features[i] = _scale(features[i])
                    data[feature_name] = features
                    # np.savez(features_path.format(action_name=action_name, feature_name=feature_name), **{feature_name:features})
                centers = np.zeros((len(segmentor.supervoxels), 3))
                colors = np.zeros((len(segmentor.supervoxels), 3), dtype=np.int8)
                for i, sv in enumerate(segmentor.supervoxels_list):
                    centers[i]= sv.center()
                    colors[i] = sv.ID
                data['centers'] = centers
                data['colors'] = colors
                del segmentor
                logger.log('Saving data')
                s = time.time()
                np.savez(features_path.format(action_name=action, feature_name='features', level=level), **data)
                # logger.log('*** Pickling ***')
                # s = time.time()
                # logger.log('Elapsed time: {0}'.format(time.time()-s))
                # pickle.dump(segmentor, open(pickle_path.format(action_name=action, video_name=video, level=level), 'w'))
                # s = time.time()
                # logger.log('Pickling action:`{action}`, video:`{video}` ...'.format(action=action, video=video))
                # logger.log('*** Collecting features / Creating databases ***')
            else: # Don't compute segments
                logger.log('No need to compute segments')
                logger.log('loading features')
                data = np.load(features_path.format(action_name=action, feature_name='features', level=level))
                # segmentor = pickle.load(open(pickle_path.format(action_name=action, video_name=video, level=level), 'r'))
                # segmentor.output_path = output_path
                # segmentor.__class__ = MySegmentation

            createVSB100Database(data, db_settings, logger)
            logger.log("Segment {0} Done!\n".format(action))
    write_db_list(db_settings, logger)
    logger.log('done!')
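# Editor's note: `parallelProcess` is used by pool.map above but is not defined in this excerpt,
# and the lambda version is commented out, most likely because Pool.map pickles its callable and
# lambdas cannot be pickled. A minimal, standalone sketch of the assumed module-level worker
# pattern (the Frame class below is hypothetical, not part of the original code):
from multiprocessing import Pool


class Frame(object):
    """Hypothetical stand-in for the segmentor objects used above."""

    def __init__(self, index):
        self.index = index
        self.processed = False

    def processNewFrame(self, frame_number):
        self.processed = True
        return frame_number


def parallelProcess(pair):
    # Must live at module level so the pool can pickle a reference to it.
    frame_number, segmentor = pair
    segmentor.processNewFrame(frame_number)
    return segmentor


if __name__ == '__main__':
    work = [(i, Frame(i)) for i in range(4)]
    pool = Pool(2)
    processed = pool.map(parallelProcess, work)  # workers get pickled copies and return them
    pool.close()
    pool.join()
    print([f.processed for f in processed])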
Exemple #47
def main():
    mapURL = {}

    logo()
    parser = argparse.ArgumentParser(description='Ragpicker Malware Crawler')
    parser.add_argument("-a",
                        "--artwork",
                        help="Show artwork",
                        action="store_true",
                        required=False)
    parser.add_argument("-p",
                        "--processes",
                        type=int,
                        default=3,
                        help="Number of processes (default=3, max=6)")
    parser.add_argument("-u",
                        "--url",
                        help="Download and analysis from a single URL")
    parser.add_argument("-d",
                        "--directory",
                        help="Load files from local directory")
    parser.add_argument("-i",
                        "--info",
                        help="Print Ragpicker config infos",
                        action="store_true",
                        required=False)
    parser.add_argument("-da",
                        "--delete",
                        help="Delete all stored data",
                        action="store_true")
    parser.add_argument('--log-level',
                        default=logging.INFO,
                        help='logging level, default=logging.INFO')
    parser.add_argument('--log-filename', help='logging filename')
    parser.add_argument('--version',
                        action='version',
                        version='Ragpicker version ' + RAGPICKER_VERSION)

    global args
    args = parser.parse_args()

    if args.artwork:
        try:
            while True:
                time.sleep(1)
                logo()
        except KeyboardInterrupt:
            return

    if args.log_level:
        log_conf = dict(
            level=args.log_level,
            format='%(levelname)s %(name)s %(module)s:%(lineno)d %(message)s')

        if args.log_filename:
            log_conf['filename'] = args.log_filename
            log.info("log-filename: " + args.log_filename)

        logging.basicConfig(**log_conf)

    if args.delete:
        worker = Worker()
        worker.runDelete()
        return

    if args.info:
        printRagpickerInfos(True)
        return

    if args.url:
        log.info(color("Download and analysis from %s" % args.url, RED))
        runWorker(args.url)
    elif args.directory:
        printRagpickerInfos()
        log.info(
            color("Load files from local directory %s" % args.directory, RED))
        mapURL = getLocalFiles(args.directory)
    else:
        printRagpickerInfos()
        # Malware URLs Crawlen
        mapURL = runCrawler()

    # Max Threads=6
    if args.processes > 6:
        args.processes = 6

    log.info(color("Processes: " + str(args.processes), RED))
    log.info(color("Process " + str(len(mapURL)) + " URLs", RED))

    # Create Process Pool
    pool = Pool(processes=args.processes)

    # Malware Download, process and reporting
    for url in mapURL.values():
        pool.apply_async(runWorker, args=(url, ))

    pool.close()
    pool.join()
Exemple #48
        print('Incorrect account or password')
        exit(0)
    else:
        print('Login success')
    # arg 1: server url
    # arg 2: user access token
    wsp = wsgamePlayer(serverurl, utoken)

    wsp.start()
    while (wsp.getStatic()):
        time.sleep(1)
    userlist = wsp.getList()
    pp = Pool()
    tlist = []

    nowNum = 1
    for pid in userlist:
        # arg 1: server ip  # arg 2: user access token  # arg 3: pid
        # Note: to skip a character, look up its id and add:
        # if pid == 'xxxxxxxx':
        #     continue
        if nowNum >= int(startaccount) and nowNum <= int(stopaccount):
            print("start")
            result = pp.apply_async(run,args=(serverurl, utoken, pid ,))
            tlist.append(result)
        nowNum = nowNum + 1
    pp.close()
    pp.join()

    print("操作结束")
def indiv_scope_run(sbml_dir, seeds, output_dir, cpu_number=1):
    """Run Menetools and analyse individual metabolic capabilities.
    
    Args:
        sbml_dir (str): directory of SBML files
        seeds (str): SBML seeds file
        output_dir (str): directory for results
        cpu_number (int): number of CPU to use for multiprocessing
    
    Returns:
        str: path to output file for scope from Menetools analysis
    """
    logger.info('######### Running individual metabolic scopes #########')

    menetools_dir = os.path.join(output_dir, 'indiv_scopes')
    indiv_scopes_path = os.path.join(menetools_dir, 'indiv_scopes.json')
    produced_seeds_path = os.path.join(menetools_dir,
                                       'indiv_produced_seeds.json')

    if not utils.is_valid_dir(menetools_dir):
        logger.critical('Impossible to access/create output directory')
        sys.exit(1)

    all_files = [
        f for f in os.listdir(sbml_dir)
        if os.path.isfile(os.path.join(sbml_dir, f)) and utils.get_extension(
            os.path.join(sbml_dir, f)).lower() in ['xml', 'sbml']
    ]
    all_scopes = {}
    all_produced_seeds = {}
    multiprocessing_indiv_scopes = []
    for f in all_files:
        bname = utils.get_basename(f)
        sbml_path = os.path.join(sbml_dir, f)
        multiprocessing_indiv_scopes.append((sbml_path, bname, seeds))

    menescope_pool = Pool(cpu_number)
    results = menescope_pool.starmap(indiv_scope_on_species,
                                     multiprocessing_indiv_scopes)
    for result in results:
        error = result[0]
        if error is True:
            logger.critical(
                '------------An error occurred during M2M run of Menetools, M2M will stop-------------'
            )
            menescope_pool.close()
            menescope_pool.join()
            sys.exit(1)
        bname = result[1]
        menescope_results = result[2]
        all_scopes[bname] = menescope_results['scope']
        all_produced_seeds[bname] = menescope_results['produced_seeds']

    menescope_pool.close()
    menescope_pool.join()

    with open(indiv_scopes_path, 'w') as dumpfile:
        json.dump(all_scopes, dumpfile, indent=4)

    with open(produced_seeds_path, 'w') as dumpfile:
        json.dump(all_produced_seeds, dumpfile, indent=4)

    return indiv_scopes_path
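# Editor's note: a minimal, standalone sketch of the Pool.starmap pattern used above, where each
# work item is a tuple of positional arguments and the worker reports failure through its return
# value so the parent can decide whether to abort (all names below are hypothetical and not part
# of Menetools/M2M):
from multiprocessing import Pool


def analyse_one(path, name, seeds):
    # Return (error_flag, name, payload) instead of raising inside the worker.
    try:
        payload = {'scope': [path, seeds]}  # placeholder for the real analysis
        return False, name, payload
    except Exception:
        return True, name, None


if __name__ == '__main__':
    jobs = [('a.sbml', 'a', 'seeds.sbml'), ('b.sbml', 'b', 'seeds.sbml')]
    pool = Pool(2)
    results = pool.starmap(analyse_one, jobs)  # starmap unpacks each tuple into arguments
    pool.close()
    pool.join()
    failed = [name for error, name, _ in results if error]
    if failed:
        raise SystemExit('analysis failed for: %s' % ', '.join(failed))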
Exemple #50
def get_iou(args, data_list, class_num, save_path=None):
    from multiprocessing import Pool
    from utils.metric import ConfusionMatrix

    ConfM = ConfusionMatrix(class_num)
    f = ConfM.generateM
    pool = Pool()
    m_list = pool.map(f, data_list)
    pool.close()
    pool.join()

    for m in m_list:
        ConfM.addM(m)

    aveJ, j_list, M = ConfM.jaccard()

    if args.dataset == 'pascal_voc':
        classes = np.array((
            'background',  # always index 0
            'aeroplane',
            'bicycle',
            'bird',
            'boat',
            'bottle',
            'bus',
            'car',
            'cat',
            'chair',
            'cow',
            'diningtable',
            'dog',
            'horse',
            'motorbike',
            'person',
            'pottedplant',
            'sheep',
            'sofa',
            'train',
            'tvmonitor'))
    elif args.dataset == 'pascal_context':
        classes = np.array(
            ('background', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
             'bus', 'car', 'cat', 'chair', 'cow', 'table', 'dog', 'horse',
             'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train',
             'tvmonitor', 'bag', 'bed', 'bench', 'book', 'building', 'cabinet',
             'ceiling', 'cloth', 'computer', 'cup', 'door', 'fence', 'floor',
             'flower', 'food', 'grass', 'ground', 'keyboard', 'light',
             'mountain', 'mouse', 'curtain', 'platform', 'sign', 'plate',
             'road', 'rock', 'shelves', 'sidewalk', 'sky', 'snow',
             'bedclothes', 'track', 'tree', 'truck', 'wall', 'water', 'window',
             'wood'))
    elif args.dataset == 'cityscapes':
        classes = np.array(
            ("road", "sidewalk", "building", "wall", "fence", "pole",
             "traffic_light", "traffic_sign", "vegetation", "terrain", "sky",
             "person", "rider", "car", "truck", "bus", "train", "motorcycle",
             "bicycle"))

    for i, iou in enumerate(j_list):
        print('class {:2d} {:12} IU {:.2f}'.format(i, classes[i], j_list[i]))

    print('meanIOU: ' + str(aveJ) + '\n')
    if save_path:
        with open(save_path, 'w') as f:
            for i, iou in enumerate(j_list):
                f.write('class {:2d} {:12} IU {:.2f}'.format(
                    i, classes[i], j_list[i]) + '\n')
            f.write('meanIOU: ' + str(aveJ) + '\n')
Exemple #51
def process_logs(settings_file):
    """
    Convert all lcm logs in a directory to rosbags.

    Args:
        dir_settings (dict): Container for all settings for the operation

    """

    # Load settings
    with open(settings_file, 'r') as f:
        dir_settings = json.load(f)

    dir_lcm_logs = os.path.expanduser(dir_settings["dir_lcm_logs"])
    if not os.path.isdir(dir_lcm_logs):
        sys.stderr.write("%s is not a directory." % dir_lcm_logs)
        sys.exit(1)

    dir_rosbags = os.path.expanduser(dir_settings["dir_rosbags"])
    if os.path.exists(dir_rosbags):
        if not os.path.isdir(dir_rosbags):
            sys.stderr.write("%s exists but it's not a directory." %
                             dir_rosbags)
            sys.exit(1)
        elif os.listdir(dir_rosbags) and not dir_settings["override"]:
            sys.stderr.write(
                "The given directory, %s, exists but it's not empty.\n"
                "Set 'override' to true if this is intended." % dir_rosbags)
            sys.exit(1)
    else:
        os.makedirs(dir_rosbags)

    log_list = [f for f in glob2.glob(os.path.join(dir_lcm_logs, '**/*.log'))]
    num_logs = len(log_list)

    print(("Converting %r logs from %s to %s" %
           (num_logs, dir_lcm_logs, dir_rosbags)))

    # Helper function for parallel processing
    def getArgsForParallelMap(logname):
        log_path = os.path.join(dir_lcm_logs, logname)
        rosbag_path = os.path.join(dir_rosbags,
                                   os.path.splitext(logname)[0] + '.bag')
        csv_path = os.path.join(
            dir_lcm_logs,
            os.path.splitext(logname)[0] + '_poses_centered.csv')

        rosbag_path = rosbag_path.replace(dir_lcm_logs, dir_rosbags)
        os.makedirs(os.path.split(rosbag_path)[0])

        return (log_path, rosbag_path, csv_path, dir_settings)

    tasks = [getArgsForParallelMap(log) for log in log_list]

    threads = Pool()
    # Shows progress.
    for _ in tqdm(threads.imap_unordered(process_one_log, tasks),
                  total=len(tasks)):
        pass
    threads.close()
    threads.join()

    # for i, log_name in enumerate(log_list):
    #     # print overall progress
    #     progress = (i+1)*100./num_logs
    #     print("%d%%\t" % progress)
    #     log_path = os.path.join(dir_lcm_logs, log_name)
    #     rosbag_path = os.path.join(dir_rosbags, os.path.splitext(log_name)[0] + '.bag')
    #     print dir_rosbags

    #     csv_path = os.path.join(dir_lcm_logs, os.path.splitext(log_name)[0] + '_poses_centered.csv')
    #     rosbag_path = rosbag_path.replace(dir_lcm_logs, dir_rosbags)
    #     os.makedirs(os.path.split(rosbag_path)[0])
    #     print rosbag_path
    #     print log_path
    #     print csv_path
    #     process_one_log((log_path, rosbag_path, csv_path, dir_settings))

    print(("Done writing logs to %s ." % dir_rosbags))
	def do_plot(self, inputDir, plotOutDir, plotOutFileName, simDataFile, validationDataFile, metadata):
		if not os.path.isdir(inputDir):
			raise Exception, 'inputDir does not currently exist as a directory'

		ap = AnalysisPaths(inputDir, variant_plot=True)
		variants = ap.get_variants()
		n_variants = len(variants)

		if n_variants <= 1:
			print('{}: This plot only runs for multiple variants'.format(__name__))
			return

		filepath.makedirs(plotOutDir)

		# Load validation data
		validation_data = cPickle.load(open(validationDataFile, 'rb'))
		toya_reactions = validation_data.reactionFlux.toya2010fluxes['reactionID']
		toya_fluxes = np.array([x.asNumber(DCW_FLUX_UNITS) for x in validation_data.reactionFlux.toya2010fluxes['reactionFlux']])
		outlier_filter = [False if rxn in OUTLIER_REACTIONS else True for rxn in toya_reactions]

		# Arrays to populate for plots
		lambdas = np.zeros(n_variants)
		n_sims = np.zeros(n_variants)
		growth_rates = np.zeros(n_variants)
		conc_correlation = np.zeros(n_variants)
		n_conc_off_axis = np.zeros(n_variants)
		flux_correlation = np.zeros(n_variants)
		nonzero_flux_correlation = np.zeros(n_variants)
		n_flux_above_0 = np.zeros(n_variants)
		n_flux_off_axis = np.zeros(n_variants)
		correlation_coefficient = np.zeros(n_variants)
		filtered_correlation_coefficient = np.zeros(n_variants)
		homeostatic_objective_value = np.zeros(n_variants)
		kinetic_objective_value = np.zeros(n_variants)
		homeostatic_objective_std = np.zeros(n_variants)
		kinetic_objective_std = np.zeros(n_variants)

		# Pull information from sim data and listeners in parallel
		pool = Pool(processes=parallelization.plotter_cpus())
		args = zip(
			variants,
			[ap] * n_variants,
			[toya_reactions] * n_variants,
			[toya_fluxes] * n_variants,
			[outlier_filter] * n_variants
			)
		results = pool.map(analyze_variant, args)
		pool.close()
		pool.join()
		for i, result in enumerate(results):
			(lambdas[i],
				n_sims[i],
				growth_rates[i],
				conc_correlation[i],
				n_conc_off_axis[i],
				flux_correlation[i],
				n_flux_off_axis[i],
				nonzero_flux_correlation[i],
				n_flux_above_0[i],
				correlation_coefficient[i],
				filtered_correlation_coefficient[i],
				kinetic_objective_value[i],
				kinetic_objective_std[i],
				homeostatic_objective_value[i],
				homeostatic_objective_std[i],
				n_metabolites,
				n_fluxes) = result

		tick_labels = [r'$10^{%i}$' % (np.log10(x),) if x != 0 else '0' for x in lambdas]
		lambdas = [np.log10(x) if x != 0 else np.nanmin(np.log10(lambdas[lambdas != 0]))-1 for x in lambdas]

		plt.figure(figsize = (8.5, 22))
		plt.style.use('seaborn-deep')
		subplots = 8

		# Growth rates
		ax = plt.subplot(subplots, 1, 1)
		plt.bar(lambdas, growth_rates / growth_rates[0], align='center')
		plt.axhline(1, linestyle='--', color='k')
		plt.ylim([0, 2])
		plt.ylabel('Growth rate deviation\nfrom no kinetics')
		whitePadSparklineAxis(ax, xAxis=False)
		plt.yticks([0, 1, 2])

		# Flux target comparisons
		ax = plt.subplot(subplots, 1, 2)
		plt.bar(lambdas, nonzero_flux_correlation, align='center')
		plt.ylim([0, 1])
		plt.ylabel('Kinetic target flux PCC')
		whitePadSparklineAxis(ax, xAxis=False)

		ax = plt.subplot(subplots, 1, 3)
		plt.bar(lambdas, n_flux_above_0 / n_fluxes, align='center')
		plt.ylim([0, 1])
		plt.ylabel('Fraction of fluxes\nabove 0')
		whitePadSparklineAxis(ax, xAxis=False)

		ax = plt.subplot(subplots, 1, 4)
		plt.bar(lambdas, n_flux_off_axis / n_fluxes, align='center')
		plt.ylim([0, 1])
		plt.ylabel('Fraction of fluxes\noff axis (>{:.0f}%)'.format(FRAC_FLUX_OFF_AXIS*100))
		whitePadSparklineAxis(ax, xAxis=False)

		# Metabolite comparisons
		ax = plt.subplot(subplots, 1, 5)
		plt.bar(lambdas, conc_correlation, align='center')
		plt.ylim([0, 1])
		plt.ylabel('Concentration PCC')
		whitePadSparklineAxis(ax, xAxis=False)

		ax = plt.subplot(subplots, 1, 6)
		plt.bar(lambdas, n_conc_off_axis / n_metabolites, align='center')
		plt.ylim([0, 1])
		plt.ylabel('Fraction of concentrations\noff axis (>{:.0f}%)'.format(FRAC_CONC_OFF_AXIS*100))
		whitePadSparklineAxis(ax, xAxis=False)

		# Toya comparison
		ax = plt.subplot(subplots, 1, 7)
		plt.bar(lambdas, filtered_correlation_coefficient, align='center')
		plt.ylim([0, 1])
		plt.ylabel('Central carbon flux PCC')
		whitePadSparklineAxis(ax, xAxis=False)

		# Viable sims
		ax = plt.subplot(subplots, 1, 8)
		plt.bar(lambdas, n_sims, align='center')
		plt.ylabel('Number of sims\nwith data')
		whitePadSparklineAxis(ax)
		plt.xticks(lambdas, tick_labels)

		plt.xlabel('lambda')

		exportFigure(plt, plotOutDir, plotOutFileName, metadata)

		# Plot kinetic vs homeostatic objective values
		plt.figure(figsize=(3.5, 3.5))
		ax = plt.gca()
		ax.set_xscale("log", nonposx='clip')
		ax.set_yscale("log", nonposy='clip')
		plt.errorbar(homeostatic_objective_value, kinetic_objective_value, xerr=homeostatic_objective_std, yerr=kinetic_objective_std, fmt='none', ecolor='k', alpha=0.5, linewidth=0.5)
		plt.plot(homeostatic_objective_value, kinetic_objective_value, "ob", markeredgewidth=0.1, alpha=0.9)
		for i in range(len(lambdas)):
			plt.text(homeostatic_objective_value[i], 0.6*kinetic_objective_value[i], i, horizontalalignment='center', verticalalignment='center')
		plt.xlabel('Homeostatic Objective Value')
		plt.ylabel('Kinetics Objective Value')

		whitePadSparklineAxis(ax)

		# Adjust limits to get tick labels to display
		xlim = ax.get_xlim()
		xlim = [10**np.floor(np.log10(xlim[0])), 10**np.ceil(np.log10(xlim[1]))]
		ax.set_xticks(xlim)
		ylim = ax.get_ylim()
		ylim = [10**np.floor(np.log10(ylim[0])), 10**np.ceil(np.log10(ylim[1]))]
		ax.set_yticks(ylim)

		exportFigure(plt, plotOutDir, '{}_obj'.format(plotOutFileName), metadata)

		plt.close('all')
def parallel_clustering(read_array, p_emp_probs, args):
    num_batches = args.nr_cores
    read_batches = [
        batch for batch in batch_list(
            read_array, num_batches, batch_type=args.batch_type)
    ]
    print("Using total nucleotide batch sizes:", [
        sum([len(seq) for i, b_i, acc, seq, qual, score in b])
        for b in read_batches
    ])
    print("Nr reads in batches:", [len(b) for b in read_batches])
    cluster_batches = []
    cluster_seq_origin_batches = []
    lowest_batch_index_db = []
    for batch in read_batches:
        tmp_clust = {}
        tmp_clust_origin = {}
        for i, b_i, acc, seq, qual, score in batch:
            tmp_clust[i] = [acc]
            tmp_clust_origin[i] = (i, b_i, acc, seq, qual, score)
        cluster_batches.append(tmp_clust)
        cluster_seq_origin_batches.append(tmp_clust_origin)
        lowest_batch_index_db.append({})
    del read_array

    ####### parallelize alignment #########
    # pool = Pool(processes=mp.cpu_count())
    original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
    signal.signal(signal.SIGINT, original_sigint_handler)
    try:
        mp.set_start_method('spawn')
        print("Environment set:", mp.get_context())
    except RuntimeError:
        print("Environment already set:", mp.get_context())
    it = 1
    while True:
        # Structure up batches
        print()
        print("ITERATION", it)
        print("Using {0} batches.".format(num_batches))

        if len(read_batches) == 1:
            start_cluster = time()

            data = {
                i + 1: ((cluster_batches[0], cluster_seq_origin_batches[0],
                         read_batches[0], p_emp_probs,
                         lowest_batch_index_db[0], 1, args), {})
            }
            result = reads_to_clusters_helper(
                data
            )  # { new_batch_index : (Cluster, cluster_seq_origin, H, new_batch_index)}
            Cluster, cluster_seq_origin, _, _ = result[1]
            print("Time elapesd clustering last iteration single core:",
                  time() - start_cluster)
            return Cluster, cluster_seq_origin

        start_multi = time()
        pool = Pool(processes=int(num_batches))
        try:
            # print([len(b) for b in read_batches])
            data = [{
                i + 1: ((cluster_batches[i], cluster_seq_origin_batches[i],
                         read_batches[i], p_emp_probs,
                         lowest_batch_index_db[i], i + 1, args), {})
            } for i in range(len(read_batches))]
            res = pool.map_async(reads_to_clusters_helper, data)
            cluster_results = res.get(
                999999999
            )  # Without the timeout this blocking call ignores all signals.
        except KeyboardInterrupt:
            print("Caught KeyboardInterrupt, terminating workers")
            pool.terminate()
            sys.exit()
        else:
            pool.close()
        pool.join()

        print("Time elapesd multiprocessing:", time() - start_multi)

        start_joining = time()
        all_repr = []  # all_repr = [top_new_seq_origins]
        all_cl = []
        all_minimizer_databases = {}
        for output_dict in cluster_results:
            print("New batch")
            for k, v in output_dict.items():
                new_clusters, new_representatives, minimizer_database_new, batch_index = v
                print("Batch index", k)
                # for new_clusters, new_representatives, minimizer_database_new, batch_index in cluster_results:
                all_cl.append(new_clusters)
                all_repr.append(new_representatives)
                all_minimizer_databases[batch_index] = minimizer_database_new

        all_clusters = merge_dicts(*all_cl)
        all_representatives = merge_dicts(*all_repr)
        read_array = [
            (i, b_index, acc, seq, qual, score)
            for i, (i, b_index, acc, seq, qual, score,
                    error_rate) in sorted(all_representatives.items(),
                                          key=lambda x: x[1][5],
                                          reverse=True)
        ]
        new_nr_repr = len(read_array)
        print("number of representatives left to cluster:", new_nr_repr)
        print("Time elapesd joining clusters:", time() - start_joining)

        # Determine new number of batches
        if num_batches == 1:
            return all_clusters, all_representatives
        else:
            print_intermediate_results(all_clusters, all_representatives, args,
                                       it)

        it += 1
        read_batches = [
            batch for batch in batch_list(read_array,
                                          num_batches,
                                          batch_type=args.batch_type,
                                          merge_consecutive=True)
        ]
        num_batches = len(read_batches)
        print("Batches after pairwise consecutive merge:", num_batches)
        print("Using total nucleotide batch sizes:", [
            sum([len(seq) for i, b_i, acc, seq, qual, score in b])
            for b in read_batches
        ])
        print("Using nr reads batch sizes:", [len(b) for b in read_batches])
        cluster_batches = []
        cluster_seq_origin_batches = []
        lowest_batch_index_db = []
        for batch in read_batches:
            tmp_clust = {}
            tmp_clust_origin = {}
            lowest_batch_index = min([
                prev_batch_index for (read_cl_id, prev_batch_index, acc, seq,
                                      qual, score) in batch
            ])
            for i, b_i, acc, seq, qual, score in batch:
                tmp_clust[i] = all_clusters[i]
                tmp_clust_origin[i] = all_representatives[i]
            cluster_batches.append(tmp_clust)
            cluster_seq_origin_batches.append(tmp_clust_origin)
            lowest_batch_index_db.append(
                all_minimizer_databases[lowest_batch_index])

        del all_minimizer_databases
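# Editor's note: a minimal sketch of the start-method guard used above.
# multiprocessing.set_start_method() may be called at most once per process, so wrapping it in
# try/except RuntimeError (as the code above does) keeps the setup safe to re-run:
import multiprocessing as mp


def square(x):
    return x * x


if __name__ == '__main__':
    try:
        mp.set_start_method('spawn')  # each worker starts from a fresh interpreter
    except RuntimeError:
        pass  # the start method was already set earlier in this process
    with mp.Pool(2) as pool:
        print(pool.map(square, range(5)))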
Exemple #54
    r = requests.get(url, stream=True)
    with open(item['title'] + '.mp4.download', "wb") as mp4:
        for chunk in tqdm(r.iter_content()):
            if chunk:
                mp4.write(chunk)
        os.rename(item['title'] + '.mp4.download', item['title'] + '.mp4')
        print(item['title'] + "  下载完成")


if __name__ == '__main__':
    localpath = 'VideoDownload/'

    local = os.path.join(localpath, '.mp4.download')
    print(local)
    with open("../10.json", 'r') as f:
        fileitem = json.loads(f.read())
        f.close()
    pool = Pool(processes=5)
    res_l = []
    for item in fileitem:
        if item['yesdown'] == 1:
            file_name = item['title']
            #print(item['downurl'])
            res = pool.apply_async(
                Download3, (item, ))  # keep the number of running processes at `processes`; when one finishes, the next queued task starts
            res_l.append(res)
            #print(item)
    pool.close()  # close the pool so no further tasks can be submitted; pending tasks finish before the workers terminate
    pool.join()  # close() must be called before join(), otherwise join() raises; join() waits for all worker processes to finish
Exemple #55
class FrameworkConsole(Console):
    """ Base command processor for the RPL Attacks Framework. """
    prompt = surround_ansi_escapes('{}{}{}{}{}{} '.format(
        colored(getuser(), 'magenta'),
        colored('@', 'cyan'),
        colored(gethostname(), 'blue'),
        colored(':', 'cyan'),
        colored('rpl-attacks', 'red'),
        colored('>>', 'cyan'),
    ))

    def __init__(self, parallel):
        self.continuation_prompt = self.prompt
        self.parallel = parallel
        width, height = get_terminal_size() or MIN_TERM_SIZE
        if any(
                map((lambda s: s[0] < s[1]), zip((height, width),
                                                 MIN_TERM_SIZE))):
            stdout.write("\x1b[8;{rows};{cols}t".format(
                rows=max(MIN_TERM_SIZE[0], height),
                cols=max(MIN_TERM_SIZE[1], width)))
        if self.parallel:
            processes = cpu_count()
            self.__last_tasklist = None
            self.tasklist = {}
            self.pool = Pool(processes, lambda: signal(SIGINT, SIG_IGN))
            atexit.register(self.graceful_exit)
        self.reexec = ['status']
        self.__bind_commands()
        super(FrameworkConsole, self).__init__()
        self.do_loglevel('info')
        self.do_clear('')

    def __bind_commands(self):
        if not self.parallel:
            for attr in ['complete_kill', 'do_kill', 'do_status']:
                delattr(FrameworkConsole, attr)
        for name, func in get_commands():
            longname = 'do_{}'.format(name)
            # set the behavior of the console command (multi-processed or not)
            # setattr(Console, longname, MethodType(FrameworkConsole.start_process_template(func) \
            #                                   if self.parallel and func.behavior.is_multiprocessed else func, self))
            setattr(Console, longname, MethodType(func, self))
            # retrieve parts of function's docstring to make console command's docstring
            parts = func.__doc__.split(':param ')
            description = parts[0].strip()
            arguments = [
                " ".join([l.strip() for l in x.split(":")[-1].split('\n')])
                for x in parts[1:]
            ]
            docstring = COMMAND_DOCSTRING["description"].format(description)
            if len(arguments) > 0:
                arg_descrs = [' - {}:\t{}'.format(n, d or "[no description]") \
                              for n, d in list(zip_longest(signature(func).parameters.keys(), arguments or []))]
                docstring += COMMAND_DOCSTRING["arguments"].format(
                    '\n'.join(arg_descrs))
            if hasattr(func, 'examples') and isinstance(func.examples, list):
                args_examples = [
                    ' >>> {} {}'.format(name, e) for e in func.examples
                ]
                docstring += COMMAND_DOCSTRING["examples"].format(
                    '\n'.join(args_examples))
            setattr(getattr(getattr(Console, longname), '__func__'), '__doc__',
                    docstring)
            # set the autocomplete list of values (can be lazy by using lambda) if relevant
            if hasattr(func, 'autocomplete'):
                setattr(
                    Console, 'complete_{}'.format(name),
                    MethodType(
                        FrameworkConsole.complete_template(func.autocomplete),
                        self))
            if hasattr(func,
                       'reexec_on_emptyline') and func.reexec_on_emptyline:
                self.reexec.append(name)

    def clean_tasks(self):
        """ Method for cleaning the list of tasks. """
        for t in [x for x in self.tasklist.keys() if x.is_expired()]:
            del self.tasklist[t]

    def cmdloop(self, intro=None):
        if self.already_running:
            with open(PIDFILE) as f:
                pid = f.read().strip()
            logger.warn(
                'RPL Attacks Framework is already running in another terminal (PID: {})'
                .format(pid))
            self.graceful_exit()
        else:
            super(FrameworkConsole, self).cmdloop()

    def complete_kill(self, text, *args):
        return sorted([str(i) for i in self.tasklist.keys() if str(i).startswith(text) \
            and self.tasklist[i]['status'] == "PENDING"])

    def complete_loglevel(self, text, *args):
        return sorted(
            [str(i) for i in LOG_LEVELS.keys() if str(i).startswith(text)])

    def do_kill(self, task):
        """
    Kill a task from the pool.
        """
        matching = [
            t for t in self.tasklist.keys()
            if str(t) == task and self.tasklist[t]['status'] == 'PENDING'
        ]
        if len(matching) > 0:
            matching[0].kill()
        else:
            print(
                ' [!] Task {} does not exist or is not a pending task'.format(
                    task))

    def do_loglevel(self, level):
        """
    Change the log level (info|warning|error|debug) [default: info].
        """
        if level in LOG_LEVELS.keys() and set_logging(level):
            print(' [I] Verbose level is now set to: {}'.format(level))
        else:
            print(' [!] Unknown verbose level: {}'.format(level))

    @no_arg_command_except('restart')
    def do_status(self, line):
        """
    Display process pool status.
        """
        self.clean_tasks()
        # this prevents from re-displaying the same status table once ENTER is pressed
        #  (marker 'restart' is handled in emptyline() hereafter
        if line == 'restart' and self.__last_tasklist is not None and \
                        hash(repr(self.tasklist)) == self.__last_tasklist:
            return
        self.__last_tasklist = hash(repr(copy(self.tasklist)))
        if len(self.tasklist) == 0:
            data = [['No task currently running']]
        else:
            data = [['Task', 'Status', 'Result']]
            for task, info in sorted(self.tasklist.items(),
                                     key=lambda x: str(x[0])):
                data.append([
                    str(task).ljust(15), info['status'].ljust(10),
                    str(info['result']).ljust(40)
                ])
        table = SingleTable(data, 'Status of opened tasks')
        table.justify_columns = {0: 'center', 1: 'center', 2: 'center'}
        print(table.table)

    def emptyline(self):
        """ Re-execute last command if it's in the list of commands to be re-executed. """
        try:
            lastcmd = self.lastcmd.split()[0]
        except IndexError:
            return
        if lastcmd in self.reexec:
            if lastcmd == 'status':
                self.lastcmd = 'status restart'
            return self.onecmd(self.lastcmd)

    def graceful_exit(self):
        """ Exit handler for terminating the process pool gracefully. """
        if 'PENDING' in [x['status'] for x in self.tasklist.values()]:
            logger.info(" > Waiting for opened processes to finish...")
            logger.warning(
                "Hit CTRL+C a second time to force process termination.")
            try:
                for task_obj in self.tasklist.keys():
                    # see: http://stackoverflow.com/questions/1408356/keyboard-interrupts-with-pythons-multiprocessing-pool
                    #  "The KeyboardInterrupt exception won't be delivered until wait() returns, and it never returns,
                    #   so the interrupt never happens. KeyboardInterrupt should almost certainly interrupt a condition
                    #   wait. Note that this doesn't happen if a timeout is specified; cond.wait(1) will receive the
                    #   interrupt immediately. So, a workaround is to specify a timeout."
                    task_obj.task.get(999999)
                self.pool.close()
                #self.pool.join()
            except KeyboardInterrupt:
                logger.info(" > Terminating opened processes...")
                for task_obj in self.tasklist.keys():
                    task_obj.kill()
                self.pool.terminate()
                self.pool.join()
        if not self.already_running:
            os.remove(PIDFILE)

    @staticmethod
    def complete_template(lazy_values):
        """ Template method for handling auto-completion. """
        def _template(self, text, line, start_index, end_index):
            try:
                values = lazy_values()
            except TypeError:
                values = lazy_values or []
            return [v for v in values if v.startswith((text or "").strip())]

        return _template
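# Editor's note: a minimal, standalone sketch of the Ctrl+C handling referenced in the comments
# above: workers ignore SIGINT so only the parent sees KeyboardInterrupt, and the result is
# fetched with a (large) timeout because AsyncResult.get() with no timeout can keep the
# interrupt from being delivered. The worker function below is hypothetical:
from multiprocessing import Pool
from signal import SIGINT, SIG_IGN, signal


def ignore_sigint():
    # Runs once in every worker process.
    signal(SIGINT, SIG_IGN)


def slow_square(x):
    return x * x


if __name__ == '__main__':
    pool = Pool(2, initializer=ignore_sigint)
    task = pool.apply_async(slow_square, (7,))
    try:
        print(task.get(999999))  # a finite timeout keeps KeyboardInterrupt deliverable
        pool.close()
    except KeyboardInterrupt:
        pool.terminate()
    finally:
        pool.join()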
def modis_live_runner():
    """Listens and triggers processing"""

    LOG.info("*** Start the runner for the MODIS level-1 processing")
    LOG.debug("os.environ = " + str(os.environ))

    # Start checking and dowloading the luts (utcpole.dat and
    # leapsec.dat):
    LOG.info("Checking the modis luts and updating " +
             "from internet if necessary!")
    fresh = check_utcpole_and_leapsec_files(DAYS_BETWEEN_URL_DOWNLOAD)
    if fresh:
        LOG.info("Files in etc dir are fresh! No url downloading....")
    else:
        LOG.warning("Files in etc are non existent or too old. " +
                    "Start url fetch...")
        update_utcpole_and_leapsec_files()

    pool = Pool(processes=6, maxtasksperchild=1)
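    # maxtasksperchild=1 makes each worker process exit after completing a single task and be
    # replaced by a fresh one, so every scene runs in a clean process (a common guard against
    # per-task memory growth).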
    manager = Manager()
    listener_q = manager.Queue()
    publisher_q = manager.Queue()

    pub_thread = FilePublisher(publisher_q)
    pub_thread.start()
    listen_thread = FileListener(listener_q)
    listen_thread.start()

    eos_files = {}
    jobs_dict = {}
    while True:

        try:
            msg = listener_q.get()
        except Empty:
            LOG.debug("Empty listener queue...")
            continue

        LOG.debug("Number of threads currently alive: " +
                  str(threading.active_count()))

        LOG.info("EOS files: " + str(eos_files))
        LOG.debug("\tMessage:")
        LOG.debug(msg)

        if 'start_time' in msg.data:
            start_time = msg.data['start_time']
        else:
            LOG.warning("start_time not in message!")
            start_time = None

        if 'end_time' in msg.data:
            end_time = msg.data['end_time']
        else:
            LOG.warning("No end_time in message!")
            end_time = None

        platform_name = msg.data['platform_name']
        orbit_number = int(msg.data['orbit_number'])
        urlobj = urlparse(msg.data['uri'])
        sensor = msg.data.get('sensor', None)

        keyname = (str(platform_name) + '_' + str(orbit_number) + '_' + str(
            start_time.strftime('%Y%m%d%H%M')))
        # Check if we have all the files before processing can start:

        status = ready2run(msg, eos_files, jobs_dict, keyname)
        if status:
            # Run
            LOG.info("Ready to run...")
            LOG.debug("Modisfile = %s", eos_files[keyname]['modisfile'])
            LOG.debug("Packetfile = %s", eos_files[keyname]['packetfile'])

            scene = {
                'platform_name': platform_name,
                'orbit_number': orbit_number,
                'starttime': start_time,
                'endtime': end_time,
                'sensor': sensor,
                'modisfilename': eos_files[keyname]['modisfile'],
                'packetfilename': eos_files[keyname]['packetfile']
            }

            if platform_name in [TERRA, AQUA]:
                # Do processing:
                LOG.info("Level-0 to lvl1 processing on " +
                         "Terra/Aqua MODIS: Start..." + " Start time = " +
                         str(start_time))
                pool.apply_async(run_terra_aqua_l0l1,
                                 (scene, msg, jobs_dict[keyname], publisher_q))
                LOG.debug("Terra/Aqua lvl1 processing sent to pool worker...")
            else:
                LOG.debug("Platform %s not supported yet...",
                          str(platform_name))

            # Block any future run on this scene for x minutes from now
            # x = 5 minutes
            thread_job_registry = threading.Timer(
                5 * 60.0,
                reset_job_registry,
                args=(jobs_dict, eos_files, keyname))
            thread_job_registry.start()

        LOG.debug("Eos-file registry: %s", str(eos_files))

    pool.close()
    pool.join()

    pub_thread.stop()
    listen_thread.stop()
def main():
    fbase = firebase.FirebaseApplication('https://hacker-news.firebaseio.com/',
                                         None)

    con = connect(dbname='hackernews',
                  user='******',
                  host='localhost',
                  password='******')
    cur = con.cursor()

    def log_item(result):
        if result is None:
            print("Response was empty ... continuing")
            return
        keys = [key for key in result if key not in ["kids", "parts"]]
        values = [postgres_escape(result, key) for key in keys]
        try:
            cur.execute("INSERT INTO items (%s) VALUES (%s)" %
                        (",".join(keys), ",".join(values)))
        except:
            f = open("sqlerror.csv", "w")
            f.write("INSERT INTO items (%s) VALUES (%s)" %
                    (",".join(keys), ",".join(values)))
            f.close()
            raise
        if "kids" in result:
            for i in range(0, len(result["kids"])):
                cur.execute(
                    "INSERT INTO item_kid (item_id, kid_id, display_rank) VALUES (%d,%d,%d)"
                    % (result["id"], result["kids"][i], i + 1))
        if "parts" in result:
            for i in range(0, len(result["parts"])):
                cur.execute(
                    "INSERT INTO item_part (item_id, part_id, display_rank) VALUES (%d,%d,%d)"
                    % (result["id"], result["parts"][i], i + 1))
        con.commit()

    cur.execute("SELECT max(id) FROM items")
    res = cur.fetchall()
    max_id = res[0][0] if res[0][0] else 0
    print("Maximum Id Scraped: %d" % max_id)

    max_id_possible = fbase.get("/v0/maxitem", None)
    print("Maximum Id On Firebase: %d" % max_id_possible)
    pool = Pool(5)

    items = range(max_id + 1, max_id_possible + 1)

    loc = 0

    while loc < len(items) - 1:
        try:
            start = time.time()
            batch = min(10000, len(items) - loc - 1)
            for i in items[loc:loc + batch]:
                endpoint = fbase._build_endpoint_url("/v0/item/%d" % i, "")
                result = pool.apply_async(firebase.make_get_request,
                                          args=(endpoint, {}, {}),
                                          callback=log_item)
            result.get(
                0xFFFF)  # specifying a timeout to get the keyboard interrupt
            print("Item: %d, Elapsed Time: %0.2f, Items/Second: %0.2f" %
                  (items[loc + batch], time.time() - start, batch /
                   (time.time() - start)))
            loc += batch
        except requests.exceptions.HTTPError as exc:
            print(exc)
            print("Error on ", i)
            continue
        except KeyboardInterrupt:
            print("Caught KeyboardInterrupt, terminating workers")
            pool.terminate()
            pool.join()

    pool.terminate()
    pool.join()

    cur.close()
    con.close()
Exemple #58
    # Fare matching
    # pro_item = ['fbr', 'fbrs', 'spec', 'ocda', 'dcda', 'twocda']
    pro_item = ['fbr']
    #
    for i in range(len(pro_item)):
        msg = 'processing %s' % pro_item[i]
        FileWrite(msg)
        start = datetime.now()
        msg = 'Start Time %s ' % start
        FileWrite(msg)
        with closing(Pool(processes=numworker)) as eventing:
            p = eventing.map(eval(pro_item[i]), arg)
        eventing.join()
        end = datetime.now()

        msg = 'End Time %s ' % end
        FileWrite(msg)
        msg = '{0} processing takes {1} seconds with {2} processors'.format(
            pro_item[i], (end - start).total_seconds(), base)
        FileWrite(msg)

    myConnection = pymysql.connect(host=hostname,
                                   user=username,
                                   passwd=password,
                                   db=database)
    stmt = 'select count(*) from zz_ws.temp_tbl_fc_fare_map_fbr'
    number = mysql(stmt)
    msg = 'fbr generates {0} records'.format(number)
# Author:Zhang Yuan
# A process pool limits how many processes are loaded at once, preventing an uncontrolled burst of processes from overwhelming the CPU.
from multiprocessing import Process, Pool, freeze_support
import os, time


def Foo(i):
    time.sleep(0.5)
    print("in process", os.getpid())
    return i + 100


def Bar(arg):
    print('-->exec done:', arg, os.getpid())


if __name__ == '__main__':
    #freeze_support()
    pool = Pool(processes=2)  # allow at most 2 worker processes in the pool at a time
    print("Main process", os.getpid())
    for i in range(10):
        pool.apply_async(func=Foo, args=(i, ), callback=Bar)  #callback is invoked in the parent process with Foo's return value
        #pool.apply(func=Foo, args=(i,)) #serial: blocks until Foo returns
        #pool.apply_async(func=Foo, args=(i,)) #asynchronous, but without a callback
    print('end')
    pool.close()  #close() must be called before waiting in join()
    pool.join()  #wait for the pool's worker processes to finish; if commented out, the program exits immediately
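# Editor's note: a minimal variant of the example above that collects the AsyncResult objects
# and calls get() instead of passing callback=Bar; the callback style runs Bar in the parent
# process as each result arrives, whereas get() gathers results in submission order. The worker
# below is hypothetical:
from multiprocessing import Pool
import os
import time


def slow_add(i):
    time.sleep(0.1)
    return i + 100


if __name__ == '__main__':
    pool = Pool(processes=2)
    pending = [pool.apply_async(slow_add, (i,)) for i in range(10)]
    pool.close()
    pool.join()
    print('parent pid:', os.getpid())
    print([r.get() for r in pending])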
def create_model(paths, exclusions, thresholds, classifyconditions):
    ''' This is the main function in the module.
    It can be called externally; it's also called
    if the module is run directly.
    '''

    sourcefolder, extension, classpath, outputpath = paths
    excludeif, excludeifnot, excludebelow, excludeabove, sizecap = exclusions
    pastthreshold, futurethreshold = thresholds
    category2sorton, positive_class, datetype, numfeatures, regularization = classifyconditions

    verbose = False

    if not sourcefolder.endswith('/'):
        sourcefolder = sourcefolder + '/'

    # This just makes things easier.

    # Get a list of files.
    allthefiles = os.listdir(sourcefolder)
    # random.shuffle(allthefiles)

    volumeIDs = list()
    volumepaths = list()

    for filename in allthefiles:

        if filename.endswith(extension):
            volID = filename.replace(extension, "")
            # The volume ID is basically the filename minus its extension.
            # Extensions are likely to be long enough that there is little
            # danger of accidental occurrence inside a filename. E.g.
            # '.fic.tsv'
            path = sourcefolder + filename
            volumeIDs.append(volID)
            volumepaths.append(path)

    metadict = metafilter.get_metadata(classpath, volumeIDs, excludeif,
                                       excludeifnot, excludebelow,
                                       excludeabove)

    # Now that we have a list of volumes with metadata, we can select the groups of IDs
    # that we actually intend to contrast. If we want to us more or less everything,
    # this may not be necessary. But in some cases we want to use randomly sampled subsets.

    # The default condition here is

    # category2sorton = 'reviewed'
    # positive_class = 'rev'
    # sizecap = 350
    # A sizecap less than one means, no sizecap.

    IDsToUse, classdictionary = metafilter.label_classes(
        metadict, category2sorton, positive_class, sizecap)

    # make a vocabulary list and a volsize dict
    wordcounts = Counter()

    volspresent = list()
    orderedIDs = list()

    positivecounts = dict()
    negativecounts = dict()

    for volid, volpath in zip(volumeIDs, volumepaths):
        if volid not in IDsToUse:
            continue
        else:
            volspresent.append((volid, volpath))
            orderedIDs.append(volid)

        date = infer_date(metadict[volid], datetype)
        if date < pastthreshold or date > futurethreshold:
            continue
        else:
            with open(volpath, encoding='utf-8') as f:
                for line in f:
                    fields = line.strip().split('\t')
                    if len(fields) > 2 or len(fields) < 2:
                        # print(line)
                        continue
                    word = fields[0]
                    if len(word) > 0 and word[0].isalpha():
                        count = int(fields[1])
                        wordcounts[word] += 1
                        # for initial feature selection we use the number of
                        # *documents* that contain a given word,
                        # so it's just +=1.

    vocablist = [x[0] for x in wordcounts.most_common(numfeatures)]

    # vocablist = binormal_select(vocablist, positivecounts, negativecounts, totalposvols, totalnegvols, 3000)
    # Feature selection is deprecated. There are cool things
    # we could do with feature selection,
    # but they'd improve accuracy by 1% at the cost of complicating our explanatory task.
    # The tradeoff isn't worth it. Explanation is more important.
    # So we just take the most common words (by number of documents containing them)
    # in the whole corpus. Technically, I suppose, we could crossvalidate that as well,
    # but *eyeroll*.

    donttrainon = list()

    # Here we create a list of volumed IDs not to be used for training.
    # For instance, we have supplemented the dataset with volumes that
    # are in the Norton but that did not actually occur in random
    # sampling. We want to make predictions for these, but never use
    # them for training.

    for idx1, anid in enumerate(orderedIDs):
        reviewedstatus = metadict[anid]['reviewed']
        date = infer_date(metadict[anid], datetype)
        if reviewedstatus == 'addedbecausecanon':
            donttrainon.append(idx1)
        elif date < pastthreshold or date > futurethreshold:
            donttrainon.append(idx1)

    authormatches = [list(donttrainon) for x in range(len(orderedIDs))]
    # For every index in authormatches, identify a set of indexes that have
    # the same author. Obvs, there will always be at least one.

    # Since we are going to use these indexes to exclude rows, we also add
    # all the ids in donttrainon to every volume

    for idx1, anid in enumerate(orderedIDs):
        thisauthor = metadict[anid]['author']
        for idx2, anotherid in enumerate(orderedIDs):
            otherauthor = metadict[anotherid]['author']
            if thisauthor == otherauthor and not idx2 in authormatches[idx1]:
                authormatches[idx1].append(idx2)

    for alist in authormatches:
        alist.sort(reverse=True)

    # I am reversing the order of indexes so that I can delete them from
    # back to front, without changing indexes yet to be deleted.
    # This will become important in the modelingprocess module.

    volsizes = dict()
    voldata = list()
    classvector = list()

    for volid, volpath in volspresent:

        with open(volpath, encoding='utf-8') as f:
            voldict = dict()
            totalcount = 0
            for line in f:
                fields = line.strip().split('\t')
                if len(fields) > 2 or len(fields) < 2:
                    continue

                word = fields[0]
                count = int(fields[1])
                voldict[word] = count
                totalcount += count

        date = infer_date(metadict[volid], datetype)
        date = date - 1700
        if date < 0:
            date = 0

        if usedate:
            features = get_features_with_date(voldict, vocablist, date,
                                              totalcount)
            voldata.append(features)
        else:
            features = get_features(voldict, vocablist)
            voldata.append(features / (totalcount + 0.001))

        volsizes[volid] = totalcount
        classflag = classdictionary[volid]
        classvector.append(classflag)

    data = pd.DataFrame(voldata)

    sextuplets = list()
    for i, volid in enumerate(orderedIDs):
        listtoexclude = authormatches[i]
        asixtuple = data, classvector, listtoexclude, i, usedate, regularization
        sextuplets.append(asixtuple)

    # Now do leave-one-out predictions.
    print('Beginning multiprocessing.')

    pool = Pool(processes=4)
    res = pool.map_async(modelingprocess.model_one_volume, sextuplets)

    # After all files are processed, write metadata, errorlog, and counts of phrases.
    res.wait()
    resultlist = res.get()

    assert len(resultlist) == len(orderedIDs)

    logisticpredictions = dict()
    for i, volid in enumerate(orderedIDs):
        logisticpredictions[volid] = resultlist[i]

    pool.close()
    pool.join()

    print('Multiprocessing concluded.')

    truepositives = 0
    truenegatives = 0
    falsepositives = 0
    falsenegatives = 0
    allvolumes = list()

    with open(outputpath, mode='w', encoding='utf-8') as f:
        writer = csv.writer(f)
        header = [
            'volid', 'reviewed', 'obscure', 'pubdate', 'birthdate', 'gender',
            'nation', 'allwords', 'logistic', 'author', 'title', 'pubname',
            'actually', 'realclass'
        ]
        writer.writerow(header)
        for volid in IDsToUse:
            metadata = metadict[volid]
            reviewed = metadata['reviewed']
            obscure = metadata['obscure']
            pubdate = infer_date(metadata, datetype)
            birthdate = metadata['birthdate']
            gender = metadata['gender']
            nation = metadata['nation']
            author = metadata['author']
            title = metadata['title']
            canonicity = metadata['canonicity']
            pubname = metadata['pubname']
            allwords = volsizes[volid]
            logistic = logisticpredictions[volid]
            realclass = classdictionary[volid]
            outrow = [
                volid, reviewed, obscure, pubdate, birthdate, gender, nation,
                allwords, logistic, author, title, pubname, canonicity,
                realclass
            ]
            writer.writerow(outrow)
            allvolumes.append(outrow)

            if logistic > 0.5 and classdictionary[volid] > 0.5:
                truepositives += 1
            elif logistic <= 0.5 and classdictionary[volid] < 0.5:
                truenegatives += 1
            elif logistic <= 0.5 and classdictionary[volid] > 0.5:
                falsenegatives += 1
            elif logistic > 0.5 and classdictionary[volid] < 0.5:
                falsepositives += 1

    donttrainon.sort(reverse=True)
    trainingset, yvals, testset = sliceframe(data, classvector, donttrainon, 0)
    newmodel = LogisticRegression(C=regularization)
    trainingset, means, stdevs = normalizearray(trainingset, usedate)
    newmodel.fit(trainingset, yvals)

    coefficients = newmodel.coef_[0] * 100

    coefficientuples = list(
        zip(coefficients, (coefficients / np.array(stdevs)),
            vocablist + ['pub.date']))
    coefficientuples.sort()
    if verbose:
        for coefficient, normalizedcoef, word in coefficientuples:
            print(word + " :  " + str(coefficient))

    print()
    accuracy = (truepositives + truenegatives) / len(IDsToUse)

    coefficientpath = outputpath.replace('.csv', '.coefs.csv')
    with open(coefficientpath, mode='w', encoding='utf-8') as f:
        writer = csv.writer(f)
        for triple in coefficientuples:
            coef, normalizedcoef, word = triple
            writer.writerow([word, coef, normalizedcoef])

    return accuracy, allvolumes, coefficientuples