Example #1
class TaskQueue:
    TASK_STATUS_QUEUED = "QUEUED"
    TASK_STATUS_IN_PROGRESS = "IN_PROGRESS"

    def __init__(self):
        self.logger = logging.getLogger(__name__)
        # Shared objects between gunicorn workers and task executor.
        self.manager = Manager()
        self.mutex = self.manager.Lock()
        self.queuedTasks = self.manager.list()  # process-safe list of queued tasks
        self.runningTasks = self.manager.list()  # process-safe list of running tasks

    def add(self, task):
        self.mutex.acquire()
        try:
            self.queuedTasks.append(task)
        except Exception:
            self.logger.exception(
                "FATAL: Job queue broken, this is a bug, please investigate!")
            raise
        finally:
            self.mutex.release()

    def addStarted(self, task):
        self.mutex.acquire()
        try:
            self.runningTasks.append(task)
        except Exception:
            self.logger.exception(
                "FATAL: Job queue broken, this is a bug, please investigate!")
            raise
        finally:
            self.mutex.release()

    def startNext(self):
        self.mutex.acquire()
        try:
            for i, task in enumerate(self.queuedTasks):
                not_before = task.get('JQ_not_before', None)
                if not_before and not TTLtool.isExpirationtimeIsoExpired(
                        not_before):
                    continue
                self.queuedTasks.pop(i)
                task.pop('JQ_not_before', None)
                self.runningTasks.append(task)
                return task
        except Exception:
            self.logger.exception(
                "FATAL: Job queue broken, this is a bug, please investigate!")
            raise
        finally:
            self.mutex.release()
        return None

    def finalize(self, job_id, rescheduleDeferralTime=None):
        self.mutex.acquire()
        try:
            # Match job_id to find task to drop.
            for i, task in enumerate(self.runningTasks):
                if task['id'] == job_id:
                    self.runningTasks.pop(i)
                    if rescheduleDeferralTime:
                        # Instead of dropping the Job, reschedule it.
                        task[
                            'JQ_not_before'] = TTLtool.calculateExpirationTimeIso(
                                datetime.datetime.utcnow().isoformat(),
                                rescheduleDeferralTime)
                        # Insert in front of the queue, so it gets rescheduled soon.
                        self.queuedTasks.insert(0, task)
                    return
            raise Exception("Job '%s' not found in running tasks list.",
                            job_id)
        except Exception:
            self.logger.exception(
                "FATAL: Job queue broken, this is a bug, please investigate!")
            raise
        finally:
            self.mutex.release()

    def inList(self, job_id):
        self.mutex.acquire()
        try:
            for task in self.runningTasks:
                if task['id'] == job_id:
                    return True
            for task in self.queuedTasks:
                if task['id'] == job_id:
                    return True
        except Exception:
            self.logger.exception(
                "FATAL: Job queue broken, this is a bug, please investigate!")
            raise
        finally:
            self.mutex.release()
        return False

    def list(self):
        tasklist = []
        self.mutex.acquire()
        try:
            for task in self.runningTasks:
                tasklist.append({
                    'Job': task['id'],
                    'Status': TaskQueue.TASK_STATUS_IN_PROGRESS
                })
            for task in self.queuedTasks:
                tasklist.append({
                    'Job': task['id'],
                    'Status': TaskQueue.TASK_STATUS_QUEUED
                })
        except Exception:
            self.logger.exception(
                "FATAL: Job queue broken, this is a bug, please investigate!")
            raise
        finally:
            self.mutex.release()
        return tasklist

    def flushQueued(self):
        nof = 0
        self.mutex.acquire()
        try:
            nof = len(self.queuedTasks)
            self.queuedTasks[:] = []
        except Exception:
            self.logger.exception(
                "FATAL: Job queue broken, this is a bug, please investigate!")
            raise
        finally:
            self.mutex.release()
        return nof
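
The class above leans entirely on Manager proxies: the list and the lock live in the manager's server process, so every worker holding a proxy sees the same state. A minimal, self-contained sketch of that pattern (standard library only, names are illustrative):

from multiprocessing import Manager, Process


def worker(shared_tasks, mutex, task_id):
    # Proxy objects pickle cleanly, so they can be passed to child processes.
    with mutex:
        shared_tasks.append({'id': task_id})


if __name__ == '__main__':
    manager = Manager()
    mutex = manager.Lock()
    shared_tasks = manager.list()

    procs = [Process(target=worker, args=(shared_tasks, mutex, i)) for i in range(4)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()

    print(list(shared_tasks))  # all four tasks, each appended under the lock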
Example #2
def main():
    # get params
    params = get_params()
    project = params['project']

    # define file paths
    unique_kmers_file = join(project, 'data', 'preprocessed', 'unique_kmers.txt')
    phenos_file = join(project, 'data', 'raw', params['pheno'])
    samples_file = join(project, 'data', 'raw', params['sample'])
    similarities_tsv = join(project, 'data', 'preprocessed', 'sample_similarities.tsv')
    hist_orig_file = join(project, 'data', 'preprocessed', 'hist_orig.png')
    hist_sim_scaled_file = join(project, 'data', 'preprocessed', 'hist_sim_scaled.png')
    hist_dissim_scaled_file = join(project, 'data', 'preprocessed', 'hist_dissim_scaled.png')
    similar_sample_file = join(project, 'data', 'preprocessed', 'similarSample_obs.txt')
    dissimilar_sample_file = join(project, 'data', 'preprocessed', 'dissimilarSample_obs.txt')
    kmer_sample_file = join(project, 'data', 'preprocessed', 'kmer_sample_map.txt')
    kmer_pheno_file = join(project, 'data', 'preprocessed', 'kmer_pheno_map.txt')
    sim_file = join(project, 'data', 'preprocessed', 'sample_int_map.pkl') 
    pim_file = join(project, 'data', 'preprocessed', 'pheno_int_map.pkl')
    uim_file = join(project, 'data', 'preprocessed', 'kmer_int_map.pkl')

    # create and load sample and pheno int maps
    if not file_exists(sim_file):
        int_maps.create_sample_int_map(samples_file, phenos_file, sim_file)
    if not file_exists(pim_file):
        int_maps.create_pheno_int_map(phenos_file, pim_file)
    sim = load_pickle(sim_file)
    
    # only do processing if output files do not exist
    if (not file_exists(kmer_sample_file) or not file_exists(kmer_pheno_file) 
            or ((not file_exists(similar_sample_file) or not file_exists(dissimilar_sample_file))
            and not file_exists(similarities_tsv))):
        # dfs holding samples that display vs not display pheno
        dfdisp, dfnodisp = create_disp_nodisp_dfs(phenos_file, sim)
        # read in all sequences in input into python object
        seqs = parse_input(samples_file)
        # number of samples
        n_samples = int(len(sim) / 2)
        # upper and lower bounds for frequency of samples to filter kmers by
        upper = int(params['maxkf'] * n_samples)
        lower = int(params['minkf'] * n_samples)
        # multiprocessing queue for transferring data back to the main process
        m = Manager()
        q = m.Queue()
        # multiprocessing lock for locking file before writing to it
        lock = m.Lock()
        # kmers file name reference for subprocesses to write to
        kmer_sample_file_ref = kmer_sample_file  # separate ref; kmer_sample_file itself is still needed for the int map below
        if file_exists(kmer_sample_file):
            kmer_sample_file_ref = None
        if file_exists(kmer_pheno_file):
            kmer_pheno_file = None
        
        kwargs = dict(raw=seqs, k=params['k'], thresh=params['correlation-thresh'],
                    upper=upper, lower=lower, dfdisp=dfdisp, dfnodisp=dfnodisp,
                    sim=sim, n=n_samples,
                    kmer_sample_file=kmer_sample_file_ref,
                    kmer_pheno_file=kmer_pheno_file)

        process_file(create_kmer_sample_map, unique_kmers_file, q=q, lock=lock, **kwargs)
       
        sample_matrix = np.zeros((n_samples, n_samples))
        num_kmers = 0
        # drain the queue and accumulate the per-chunk results
        while not q.empty():
            q_num_kmers, q_sample_matrix = q.get()
            num_kmers += q_num_kmers
            sample_matrix += q_sample_matrix
        
        # create sample similarity file if the similarities tsv does not exist
        if not file_exists(similar_sample_file) or not file_exists(dissimilar_sample_file):
            similar_sample(sample_matrix, num_kmers, similarities_tsv,
                hist_orig_file, hist_sim_scaled_file, hist_dissim_scaled_file,
                similar_sample_file, dissimilar_sample_file)
    if (not file_exists(similar_sample_file) or not file_exists(dissimilar_sample_file)) and file_exists(similarities_tsv):
        similar_sample(None, None, similarities_tsv, hist_orig_file,
            hist_sim_scaled_file, hist_dissim_scaled_file,
            similar_sample_file, dissimilar_sample_file)
    # create kmer int map
    if not file_exists(uim_file):
        int_maps.create_kmer_int_map(kmer_sample_file, uim_file)
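
The concurrency piece here is the Manager Queue plus Lock handed to process_file: workers push per-chunk results, and the main process drains and accumulates them. A hedged, self-contained sketch of that collect-and-reduce pattern, where chunk_worker is a hypothetical stand-in for create_kmer_sample_map:

from functools import partial
from multiprocessing import Manager, Pool

import numpy as np


def chunk_worker(chunk_id, n=3, q=None, lock=None):
    # Hypothetical stand-in for the real worker: computes a partial
    # (num_kmers, sample_matrix) pair and ships it to the main process via q.
    # The lock is unused here; in the original it guards shared output files.
    q.put((1, np.eye(n)))


if __name__ == '__main__':
    n = 3
    m = Manager()
    q = m.Queue()
    lock = m.Lock()

    with Pool(2) as pool:
        pool.map(partial(chunk_worker, n=n, q=q, lock=lock), range(4))

    num_kmers, sample_matrix = 0, np.zeros((n, n))
    while not q.empty():
        q_num_kmers, q_sample_matrix = q.get()
        num_kmers += q_num_kmers
        sample_matrix += q_sample_matrix
    print(num_kmers, sample_matrix.trace())  # 4 12.0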
Example #3
from multiprocessing import Process, Lock
from multiprocessing import Manager
import random
from time import sleep
import sys

man = Manager()
printLock = man.Lock()
tableLock = man.Lock()
onTable = man.list()


class Person:
    def __init__(self, name, role):
        self.name = name
        self.role = role
        print('Hi my name is ' + name)

    def safePrint(self, toPrint):
        printLock.acquire()
        print(toPrint)
        printLock.release()


class smoker(Person):
    def __init__(self, name, has, role):
        Person.__init__(self, name, role)
        self.has = has
        self.needs = self.getNeeds(self.has)
        print("I need", self.needs)
Example #4
def test_linearity(
        run, parameter, checkpoint,
        steps_per_segment, epsilon=1E-6,
        checkpoint_path=None, checkpoint_interval=1, simultaneous_runs=None,
        run_ddt=None, return_checkpoint=False, get_host_dir=None, spawn_compute_job=None):
    """
    """
    compute_outputs = []

    run = RunWrapper(run)
    assert verify_checkpoint(checkpoint)
    u0, V, v, lss, G_lss, g_lss, J_hist, G_dil, g_dil = checkpoint

    manager = Manager()
    interprocess = (manager.Lock(), manager.dict())

    i = lss.K_segments()
    run_id = 'time_dilation_{0:02d}'.format(i)
    if run_ddt is not None:
        time_dil = TimeDilationExact(run_ddt, u0, parameter)
    else:
        time_dil = TimeDilation(run, u0, parameter, run_id,
                                simultaneous_runs, interprocess)

    V = time_dil.project(V)
    v = time_dil.project(v)
    V, v = lss.checkpoint(V, v)
   
    # first run
    print(v.shape, V.shape)
    v1 = v/1.0
    V1 = V[0:2]/1.0
    print(v1.shape, V1.shape)
    printvV(v1, V1, spawn_compute_job, interprocess)

    _, V1, v1, J0, G, g = run_segment(
            run, u0, V1, v1, parameter, i, steps_per_segment,
            epsilon, simultaneous_runs, interprocess, get_host_dir=get_host_dir,
            compute_outputs=compute_outputs, spawn_compute_job=spawn_compute_job)

    print(v1.shape, V1.shape)
    printvV(v1, V1, spawn_compute_job, interprocess)

    # shrink V by 2 and run
    v2 = v/2.0
    V2 = V[0:2]/2.0
    print(v2.shape, V2.shape)
    printvV(v2, V2, spawn_compute_job, interprocess)

    _, V2, v2, J0, G, g = run_segment(
            run, u0, V2, v2, parameter, i, steps_per_segment,
            epsilon, simultaneous_runs, interprocess, get_host_dir=get_host_dir,
            compute_outputs=compute_outputs, spawn_compute_job=spawn_compute_job)

    print(v2.shape, V2.shape)
    printvV(v2, V2, spawn_compute_job, interprocess)

    # check if still within linear region
    print('-'*20)
    print(np.linalg.norm(V1.field))
    print(np.linalg.norm(V2.field)*2.0)
Example #5
def multi_mode(cli_parsed):
    dbm = db_manager.DB_Manager(cli_parsed.d + '/ew.db')
    dbm.open_connection()
    if not cli_parsed.resume:
        dbm.initialize_db()
    dbm.save_options(cli_parsed)
    m = Manager()
    targets = m.Queue()
    lock = m.Lock()
    multi_counter = m.Value('i', 0)
    display = None

    def exitsig(*args):
        dbm.close()
        if current_process().name == 'MainProcess':
            print ''
            print 'Resume using ./EyeWitness.py --resume {0}'.format(
                cli_parsed.d + '/ew.db')
        os._exit(1)

    signal.signal(signal.SIGINT, exitsig)
    if cli_parsed.resume:
        pass
    else:
        url_list, rdp_list, vnc_list = target_creator(cli_parsed)
        if any((cli_parsed.web, cli_parsed.headless)):
            for url in url_list:
                dbm.create_http_object(url, cli_parsed)
        for rdp in rdp_list:
            dbm.create_vnc_rdp_object('rdp', rdp, cli_parsed)
        for vnc in vnc_list:
            dbm.create_vnc_rdp_object('vnc', vnc, cli_parsed)

    if any((cli_parsed.web, cli_parsed.headless)):
        if cli_parsed.web and not cli_parsed.show_selenium:
            display = Display(visible=0, size=(1920, 1080))
            display.start()

        multi_total = dbm.get_incomplete_http(targets)
        if multi_total > 0:
            if cli_parsed.resume:
                print 'Resuming Web Scan ({0} Hosts Remaining)'.format(
                    str(multi_total))
            else:
                print 'Starting Web Requests ({0} Hosts)'.format(
                    str(multi_total))

        if multi_total < cli_parsed.threads:
            num_threads = multi_total
        else:
            num_threads = cli_parsed.threads
        for i in xrange(num_threads):
            targets.put(None)
        try:
            workers = [
                Process(target=worker_thread,
                        args=(cli_parsed, targets, lock, (multi_counter,
                                                          multi_total)))
                for i in xrange(num_threads)
            ]
            for w in workers:
                w.start()
            for w in workers:
                w.join()
        except Exception as e:
            print str(e)

        # Set up UA table here
        if cli_parsed.cycle is not None:
            ua_dict = get_ua_values(cli_parsed.cycle)
            if not cli_parsed.ua_init:
                dbm.clear_table("ua")
                completed = dbm.get_complete_http()
                completed[:] = [x for x in completed if x.error_state is None]
                for item in completed:
                    for browser, ua in ua_dict.iteritems():
                        dbm.create_ua_object(item, browser, ua)

                cli_parsed.ua_init = True
                dbm.clear_table("opts")
                dbm.save_options(cli_parsed)

            for browser, ua in ua_dict.iteritems():
                targets = m.Queue()
                multi_counter.value = 0
                multi_total = dbm.get_incomplete_ua(targets, browser)
                if multi_total > 0:
                    print(
                        "[*] Starting requests for User Agent {0}"
                        " ({1} Hosts)").format(browser, str(multi_total))
                if multi_total < cli_parsed.threads:
                    num_threads = multi_total
                else:
                    num_threads = cli_parsed.threads
                for i in xrange(num_threads):
                    targets.put(None)
                workers = [
                    Process(target=worker_thread,
                            args=(cli_parsed, targets, lock,
                                  (multi_counter, multi_total), (browser, ua)))
                    for i in xrange(num_threads)
                ]
                for w in workers:
                    w.start()
                for w in workers:
                    w.join()

    if any((cli_parsed.vnc, cli_parsed.rdp)):
        log._LOG_LEVEL = log.Level.ERROR
        multi_total, targets = dbm.get_incomplete_vnc_rdp()
        if multi_total > 0:
            print ''
            print 'Starting VNC/RDP Requests ({0} Hosts)'.format(
                str(multi_total))

            app = QtGui.QApplication(sys.argv)
            timer = QTimer()
            timer.start(10)
            timer.timeout.connect(lambda: None)

            # add qt4 reactor
            import qt4reactor
            qt4reactor.install()
            from twisted.internet import reactor

            for target in targets:
                if os.path.dirname(cli_parsed.d) != os.path.dirname(
                        target.screenshot_path):
                    target.set_paths(cli_parsed.d)
                tdbm = db_manager.DB_Manager(cli_parsed.d + '/ew.db')
                if target.proto == 'vnc':
                    reactor.connectTCP(
                        target.remote_system, target.port,
                        vnc_module.RFBScreenShotFactory(
                            target.screenshot_path, reactor, app, target,
                            tdbm))
                else:
                    reactor.connectTCP(
                        target.remote_system, int(target.port),
                        rdp_module.RDPScreenShotFactory(
                            reactor, app, 1200, 800, target.screenshot_path,
                            cli_parsed.timeout, target, tdbm))
            reactor.runReturn()
            app.exec_()

    if display is not None:
        display.stop()
    results = dbm.get_complete_http()
    vnc_rdp = dbm.get_complete_vnc_rdp()
    dbm.close()
    m.shutdown()
    write_vnc_rdp_data(cli_parsed, vnc_rdp)
    sort_data_and_write(cli_parsed, results)
Example #6
def get_lock():
    m = Manager()
    l = m.Lock()
    return l
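
The point of going through a Manager here, rather than multiprocessing.Lock(), is that the returned proxy is picklable: it can be handed to Pool workers as an ordinary argument, whereas a plain Lock may only be shared through inheritance. A small sketch under that assumption:

from multiprocessing import Manager, Pool


def get_lock():
    m = Manager()
    return m.Lock()


def critical(job_id, lock):
    with lock:  # the proxy arrives as a plain, pickled argument
        print('job', job_id, 'in critical section')


if __name__ == '__main__':
    lock = get_lock()
    with Pool(2) as pool:
        pool.starmap(critical, [(i, lock) for i in range(4)])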
Example #7
def dir_to_spectrogram(audio_dir, saving_dir, noise_dir, labels_dir,
                       num_processes, f_length, t_length, noise_power):
    """ Creates spectrograms of all the audio files in a dir

    :param audio_dir: path of directory with audio files
    :param noise_dir: path to noise audio files
    :return:
    """

    m = Manager()
    l = m.Lock()
    cnt = m.Value('int', 0)
    audio_spectrogram = m.dict({'data': m.list()})

    wav_dir = os.path.join(audio_dir, 'recordings')
    file_names = [
        f for f in os.listdir(wav_dir)
        if os.path.isfile(os.path.join(wav_dir, f)) and '.wav' in f
    ]

    if len(file_names) == 0:
        print('No .wav file in %s' % wav_dir)
        exit(1)

    noise_names = get_noise_names(noise_dir)

    # 4 speakers and 50 wav files of each digit per speaker
    speakers = ['jackson', 'nicolas', 'theo', 'yweweler']
    test_speaker = speakers[-1]

    # 50 noise files
    train_noise_names = noise_names[:40]
    test_noise_names = noise_names[-10:]

    train_category = {str(i): list() for i in range(10)}
    test_category = {str(i): list() for i in range(10)}
    for file_name in file_names:
        if test_speaker in file_name:
            test_category[file_name[0]].append(file_name)
        else:
            train_category[file_name[0]].append(file_name)

    train_labels = np.load(os.path.join(labels_dir, 'train_labels.npy'))
    test_labels = np.load(os.path.join(labels_dir, 'test_labels.npy'))

    train_names = []
    train_noises = []
    test_names = []
    test_noises = []

    idx_list = [0 for i in range(10)]
    noise_idx = 0
    for train_label in train_labels:
        train_names.append(
            train_category[str(train_label)][idx_list[train_label]])
        idx_list[train_label] += 1
        idx_list[train_label] %= 150

        train_noises.append(train_noise_names[noise_idx])
        noise_idx += 1
        noise_idx %= 40

    idx_list = [0 for i in range(10)]
    noise_idx = 0
    for test_label in test_labels:
        test_names.append(test_category[str(test_label)][idx_list[test_label]])
        idx_list[test_label] += 1
        idx_list[test_label] %= 50

        test_noises.append(test_noise_names[noise_idx])
        noise_idx += 1
        noise_idx %= 10

    names = train_names + test_names
    noises = train_noises + test_noises

    print('start')
    pool = Pool(processes=num_processes, initializer=pool_init, initargs=(l, ))
    for i in range(len(names)):
        pool.apply_async(wav_to_spectrogram,
                         args=(wav_dir, names[i], noises[i], f_length,
                               t_length, noise_power, cnt, audio_spectrogram))

    # close the pool so no new tasks can be submitted
    pool.close()
    # waiting until all the tasks are finished
    pool.join()

    data = np.array(audio_spectrogram['data'])

    # data = np.array([i for i in audio_spectrogram['data']])
    # labels = np.array([i for i in audio_spectrogram['labels']])

    train_data = data[:60000]
    test_data = data[60000:]
    np.save(os.path.join(saving_dir, 'train_data.npy'), train_data)
    np.save(os.path.join(saving_dir, 'test_data.npy'), test_data)
Example #8
def __init__(self):
    manager = Manager()
    self.lock = manager.Lock()
    self.queue = manager.Queue()
Example #9
def main():
    args = parse_args()

    print(f'{args.load_type} loading!')
    load_func = getattr(bt.datasets, 'load_' + args.load_type)
    contents, classes = load_func(img_dir=args.img_dir,
                                  ann_dir=args.ann_dir,
                                  classes=args.classes,
                                  nproc=args.load_nproc)
    if args.prior_annfile is not None:
        prior, _ = bt.load_pkl(args.prior_annfile, classes=classes)
        bt.merge_prior_contents(contents, prior, merge_type=args.merge_type)

    shown_names = classes if args.shown_names is None \
            else bt.get_classes(args.shown_names)
    assert len(shown_names) == len(classes)

    if isinstance(args.ids, (list, type(None))):
        ids = args.ids
    elif isinstance(args.ids, str):
        if osp.isfile(args.ids):
            with open(args.ids, 'r') as f:
                ids = [l.strip() for l in f]
        else:
            ids = args.ids.split('|')
    else:
        raise TypeError('Wrong base_json input in `ids`')

    tasks, max_label = [], 0
    for content in contents:
        if ids is not None and content['id'] not in ids:
            continue

        imgpath = osp.join(args.img_dir, content['filename'])
        out_file = osp.join(args.save_dir, content['filename']) \
                if args.save_dir else None
        if 'ann' in content:
            ann = content['ann']
            bboxes = ann['bboxes']
            labels = ann['labels']
            scores = ann.get('scores', None)
        else:
            bboxes = np.zeros((0, 4), dtype=float)
            labels = np.zeros((0, ), dtype=int)
            scores = None

        if (scores is not None) and (args.score_thr > 0):
            bboxes = bboxes[scores > args.score_thr]
            labels = labels[scores > args.score_thr]
            scores = scores[scores > args.score_thr]

        if args.skip_empty and bboxes.size == 0:
            continue

        if labels.size > 0:
            max_label = max(max_label, labels.max())
        tasks.append((imgpath, out_file, bboxes, labels, scores))

    if args.colors == 'random':
        args.colors = bt.random_colors(max_label + 1)

    if args.random_vis:
        shuffle(tasks)

    if args.save_dir and (not osp.exists(args.save_dir)):
        os.makedirs(args.save_dir)

    if args.show_off:
        plt.switch_backend('Agg')

    manager = Manager()
    _vis_func = partial(single_vis,
                        btype=args.shown_btype,
                        class_names=shown_names,
                        colors=args.colors,
                        thickness=args.thickness,
                        text_off=args.text_off,
                        font_size=args.font_size,
                        show_off=args.show_off,
                        wait_time=args.wait_time,
                        lock=manager.Lock(),
                        prog=manager.Value('i', 0),
                        total=len(tasks))
    if args.show_off and args.vis_nproc > 1:
        pool = Pool(args.vis_nproc)
        pool.map(_vis_func, tasks)
        pool.close()
    else:
        list(map(_vis_func, tasks))

    if args.save_dir:
        arg_dict = vars(args)
        arg_dict.pop('base_json', None)
        with open(osp.join(args.save_dir, 'vis_config.json'), 'w') as f:
            json.dump(arg_dict, f, indent=4)
Example #10
    # -------------------------------------------------------------------------------
    # Text File Parsing
    # -----------------

    # skip non txt files
    files = [(line_args.input_dir, file)
             for file in os.listdir(line_args.input_dir)
             if file.endswith(".txt")]

    # Set the number of files
    total_doc_cnt = len(files)

    # create a new manager object for the shared dictionary, lock
    m = Manager()
    inv_lock = m.Lock()
    inverted_file = m.dict()

    # Create a pool of processes
    p = Pool(line_args.process_cnt)

    # use async mapping so each process picks up a job and starts computing it;
    # we don't care about the order in which that happens
    r = p.map_async(parse_file, files)

    # wait here for every child process to finish
    p.close()
    p.join()

    # Increment returned statistics
    for p in r.get():
Example #11
def getIntensity(projectFile, pathTpl, view=None):
    xsocsH5 = ProjectItem(projectFile).xsocsH5

    with xsocsH5:
        entries = xsocsH5.entries()

    subject = ProgressSubject()
    tree = TreeView(view)
    tree.setShowUniqueGroup(True)
    model = Model()

    progressGroup = ProgressGroup(subject=subject, nodeName='Intensity')
    progressGroup.start()
    progressGroup.setEntries(entries)
    model.appendGroup(progressGroup)

    app = Qt.qApp

    mw = Qt.QDialog(view)
    mw.setModal(True)
    mw.setWindowTitle('Setting up data.')
    layout = Qt.QVBoxLayout(mw)
    tree.setModel(model)
    layout.addWidget(tree)
    mw.show()
    app.processEvents()

    manager = Manager()
    projectLock = manager.Lock()
    queue = manager.Queue()

    n_proc = cpu_count()

    pool = Pool(n_proc,
                maxtasksperchild=2)
    results = OrderedDict()

    for entry in entries:

        entry_f = xsocsH5.object_filename(entry)

        args = (entry,
                entry_f,
                projectLock,
                projectFile,
                pathTpl,
                queue,)

        results[entry] = pool.apply_async(_getIntensity,
                                          args)
    pool.close()

    while results:
        try:
            msg = queue.get(True, 0.01)
            if msg['done']:
                del results[msg['id']]
            subject.sigStateChanged.emit(msg)
        except queues.Empty:
            pass
        app.processEvents()

    pool.join()

    mw.close()
    mw.deleteLater()
Example #12
    assert os.path.exists(data_dir), data_dir
    assert os.path.exists(wav_scp_f), wav_scp_f
    assert os.path.exists(spk2utt_f), spk2utt_f

    if data_dir != out_dir:
        print('Copy wav.scp, spk2utt, utt2spk, trials to %s' % out_dir)
        for f in ['wav.scp', 'spk2utt', 'utt2spk', 'trials']:
            orig_f = os.path.join(data_dir, f)
            targ_f = os.path.join(out_dir, f)
            if os.path.exists(orig_f):
                os.system('cp %s %s' % (orig_f, targ_f))

    start_time = time.time()

    manager = Manager()
    lock_i = manager.Lock()
    lock_t = manager.Lock()

    feat_dim = train_dir.__getitem__(1)[0].shape[-1]
    mem_data = psutil.virtual_memory()
    free_mem = mem_data.available
    maxsize = int(free_mem / (args.num_frames * feat_dim * 4) * 0.5)
    print('Maxsize for Queue is %d' % maxsize)

    task_queue = manager.Queue(maxsize=maxsize)
    idx_queue = manager.Queue()
    error_queue = manager.Queue()
    prep_jb = 3
    if args.train:

        utts = [i for i in range(len(train_dir))]
Example #13
        locker.release()
    except ConnectionError:
        print('Error!', url)
    finally:
        print(url, 'succeeded')


if __name__ == '__main__':
    start = time.perf_counter()
    url = 'http://openaccess.thecvf.com/CVPR2018.py'
    prelink = 'http://openaccess.thecvf.com/'
    urls = geturl(url, prelink)

    pool = Pool(processes=32)
    manager = Manager()
    locker = manager.Lock()

    authors = manager.list()
    pdfs = manager.list()
    abstracts = manager.list()
    titles = manager.list()
    books = manager.list()
    months = manager.list()
    years = manager.list()

    data = (pdfs, abstracts, authors, titles, books, months, years)

    p_crawl = partial(crawl, locker, data, prelink)
    pool.map(p_crawl, urls)

    elapsed = (time.perf_counter() - start)
Example #14
def auto_inc(fake):
    manager = Manager()
    return Counter(manager.Value('i', 0), manager.Lock())
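
Counter is not defined in this snippet; one plausible, hypothetical shape for it is a thin wrapper that increments a manager Value while holding the manager Lock:

from multiprocessing import Manager


class Counter:
    # Hypothetical sketch: a process-safe auto-increment built from a
    # manager Value and a manager Lock.
    def __init__(self, value, lock):
        self._value = value
        self._lock = lock

    def next(self):
        with self._lock:
            self._value.value += 1
            return self._value.value


if __name__ == '__main__':
    manager = Manager()
    counter = Counter(manager.Value('i', 0), manager.Lock())
    print(counter.next(), counter.next())  # 1 2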
Example #15
        line=rvline.rstrip()
        val_list=line.split('~')
        summary=val_list[0] + "\t" + val_list[2] + "\t" + val_list[3].rstrip() + "\t" + val_list[4].rstrip() + "\t" + val_list[5].rstrip()+ "\t" + val_list[6] + "\t" + val_list[7]
        
    time.sleep(1)
    L.acquire()
    refresh_slot(ns, 'R', slot, jobname, G, p, summary, returncd[absi])
    L.release()
    
#-------------------------------------------------

if __name__ == '__main__':
    mgr=Manager()
    ns=mgr.Namespace()
    G =mgr.list() #Global
    L =mgr.Lock()

    ns.M=''
    n=random.randint(900001,999999)
    m=random.randint(1,999)
    p=n*m
    start=time.time()
    gstarttm1=datetime.now().strftime('%Y.%m.%d  %H:%M:%S')
    os.system("cls")
    print("job_run Starting: " + gstarttm1)
    c=m_config('master_job_configuration.txt')
    print ("Configuration : "+c.job_list)
    print ("Parallel Slots in this run: " + c.slots)
    
    ns.max_slots=int(c.slots)
    jc          = j_config(c)
Example #16
def __init__(self, manager: Manager, max_size=0):
    self.lock = manager.Lock()
    self.not_full = manager.Condition(self.lock)
    self.not_empty = manager.Condition(self.lock)
    self.closed = manager.Value(c_bool, False)
    self.queue = manager.Queue(max_size)
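
How the Conditions and the closed flag are meant to interact is not shown; a simplified, hypothetical sketch of a closeable queue built from these pieces (using only not_empty, and assuming c_bool comes from ctypes):

from ctypes import c_bool
from multiprocessing import Manager


class CloseableQueue:
    def __init__(self, manager, max_size=0):
        self.lock = manager.Lock()
        self.not_empty = manager.Condition(self.lock)
        self.closed = manager.Value(c_bool, False)
        self.queue = manager.Queue(max_size)

    def put(self, item):
        self.queue.put(item)
        with self.not_empty:
            self.not_empty.notify()

    def close(self):
        with self.not_empty:
            self.closed.value = True
            self.not_empty.notify_all()

    def get(self):
        # Returns None once the queue is closed and drained.
        with self.not_empty:
            while self.queue.empty():
                if self.closed.value:
                    return None
                self.not_empty.wait()
            return self.queue.get()


if __name__ == '__main__':
    q = CloseableQueue(Manager())
    q.put('item')
    print(q.get())  # 'item'
    q.close()
    print(q.get())  # None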
Example #17
        print(lock)
        with lock:
            ids.value = ids.value + 1
        yield 'product' + str(ids.value)


def consumer(name, ids, lock, times):
    while True:
        print(lock)
        product = next(producer(ids, lock))
        with lock:
            print(name, 'consume:', product)
        time.sleep(times)


if __name__ == '__main__':
    manager = Manager()
    lock = manager.Lock()
    ids = manager.Value('f', 1)

    p1 = Process(target=consumer, args=('A', ids, lock, 1))
    p2 = Process(target=consumer, args=('B', ids, lock, 1))
    p3 = Process(target=consumer, args=('C', ids, lock, 1))

    p1.start()
    p2.start()
    p3.start()

    p1.join()
    p2.join()
    p3.join()
Example #18
def verify_query_answering(queries_to_verify_count=1000):
    '''
    Randomly check some queries.
    '''
    global g_sub_q_list, g_answered_q_list, g_v, g_lock

    unansw_q_list = pickle.load(open('unanswered_cern_queries.p', 'rb'))

    length = len(unansw_q_list)

    g_sub_q_list = random.sample(
        unansw_q_list,
        queries_to_verify_count,
    )

    del unansw_q_list

    print('read unanswered queries !')

    m = Manager()

    g_v = m.Value('i', 0)

    g_lock = m.Lock()

    print('first pass')
    g_answered_q_list = pickle.load(open('answered_cern_queries.p',
                                         'rb'))[:length // 4]
    print('will start pool !')
    p = Pool(n_proc)
    p.map(verify_per_proc, range(queries_to_verify_count))
    p.close()
    print()

    print('second pass')
    g_answered_q_list = pickle.load(open('answered_cern_queries.p',
                                         'rb'))[length // 4:length // 2]
    print('will start pool !')
    p = Pool(n_proc)
    p.map(verify_per_proc, range(queries_to_verify_count))
    p.close()
    print()

    print('third pass')
    g_answered_q_list = pickle.load(open('answered_cern_queries.p',
                                         'rb'))[length // 2:3 * length // 4]
    print('will start pool !')
    p = Pool(n_proc)
    p.map(verify_per_proc, range(queries_to_verify_count))
    p.close()
    print()

    print('fourth pass')
    g_answered_q_list = pickle.load(open('answered_cern_queries.p',
                                         'rb'))[3 * length // 4:]
    print('will start pool !')
    p = Pool(n_proc)
    p.map(verify_per_proc, range(queries_to_verify_count))
    p.close()
    print()

    print(g_v)
Example #19
def ngram_jurisdictions(slug=None, max_n=3):
    """
        Add jurisdiction specified by slug to rocksdb, or all jurisdictions if name not provided.

        This is the primary ngrams entrypoint. It spawns NGRAM_THREAD_COUNT worker processes to
        ngram each jurisdiction-year, plus a rocksdb worker process that pulls their work off of
        the queue and writes it to the database.
    """
    # process pool of workers to ngram each jurisdiction-year and return keys
    ngram_workers = Pool(settings.NGRAM_THREAD_COUNT, maxtasksperchild=1)

    # inter-process queue of returned keys
    m = Manager()
    queue = m.Queue(settings.NGRAM_THREAD_COUNT)
    ngram_worker_offsets = m.dict()
    ngram_worker_lock = m.Lock()

    # process to write keys to rocksdb
    rocksdb_loaded = m.Condition()
    rocksdb_worker = Process(target=rocksdb_writer,
                             args=(queue, rocksdb_loaded))
    rocksdb_worker.start()
    with rocksdb_loaded:
        rocksdb_loaded.wait()

    # queue each jurisdiction-year for processing
    jurisdictions = Jurisdiction.objects.all()
    if slug:
        jurisdictions = jurisdictions.filter(slug=slug)
    ngram_worker_results = []
    for jurisdiction in jurisdictions:

        # skip empty jurisdictions
        if not jurisdiction.case_metadatas.exists():
            continue

        # get year range
        case_query = CaseMetadata.objects.in_scope().filter(
            jurisdiction__slug=jurisdiction.slug)
        first_year = case_query.order_by('decision_date',
                                         'id').first().decision_date.year
        last_year = case_query.order_by('-decision_date',
                                        '-id').first().decision_date.year

        # ngram each year
        for year in range(first_year, last_year + 1):
            # ngram_worker(queue, jurisdiction_id, year, max_n)
            ngram_worker_results.append(
                (jurisdiction.slug, year,
                 ngram_workers.apply_async(
                     ngram_worker,
                     (ngram_worker_offsets, ngram_worker_lock, queue,
                      jurisdiction.id, jurisdiction.slug, year, max_n))))

    # wait for all ngram workers to finish
    ngram_workers.close()
    ngram_workers.join()

    # report failures
    for jurisdiction_slug, year, result in ngram_worker_results:
        if not result._success:
            exc = result._value
            print("%s-%s failed:" % (jurisdiction_slug, year))
            traceback.print_exception(etype=type(exc),
                                      value=exc,
                                      tb=exc.__traceback__)

    # tell rocksdb worker to exit, and wait for it to finish
    queue.put('STOP')
    rocksdb_worker.join()
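
ngram_worker itself is not shown; the piece relevant here is the shared ngram_worker_offsets dict guarded by ngram_worker_lock. A hedged sketch of that shared-dict pattern, with record_offset as a hypothetical stand-in for the worker:

from functools import partial
from multiprocessing import Manager, Pool


def record_offset(job_id, offsets=None, lock=None):
    # Hypothetical stand-in for ngram_worker: update the shared dict under
    # the manager lock so concurrent workers do not clobber each other.
    with lock:
        offsets[job_id] = offsets.get(job_id, 0) + 1


if __name__ == '__main__':
    m = Manager()
    offsets = m.dict()
    lock = m.Lock()
    with Pool(4) as pool:
        pool.map(partial(record_offset, offsets=offsets, lock=lock),
                 [i % 3 for i in range(12)])
    print(dict(offsets))  # {0: 4, 1: 4, 2: 4}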
Example #20
def multi_mode(cli_parsed):
    dbm = db_manager.DB_Manager(cli_parsed.d + '/ew.db')
    dbm.open_connection()
    if not cli_parsed.resume:
        dbm.initialize_db()
    dbm.save_options(cli_parsed)
    m = Manager()
    targets = m.Queue()
    lock = m.Lock()
    multi_counter = m.Value('i', 0)
    display = None

    def exitsig(*args):
        dbm.close()
        if current_process().name == 'MainProcess':
            print('')
            print('Resume using ./EyeWitness.py --resume {0}'.format(
                cli_parsed.d + '/ew.db'))
        os._exit(1)

    signal.signal(signal.SIGINT, exitsig)
    if cli_parsed.resume:
        pass
    else:
        url_list = target_creator(cli_parsed)
        if cli_parsed.web:
            for url in url_list:
                dbm.create_http_object(url, cli_parsed)

    if cli_parsed.web:
        if cli_parsed.web and not cli_parsed.show_selenium:
            display = Display(visible=0, size=(1920, 1080))
            display.start()

        multi_total = dbm.get_incomplete_http(targets)
        if multi_total > 0:
            if cli_parsed.resume:
                print('Resuming Web Scan ({0} Hosts Remaining)'.format(
                    str(multi_total)))
            else:
                print('Starting Web Requests ({0} Hosts)'.format(
                    str(multi_total)))

        if multi_total < cli_parsed.threads:
            num_threads = multi_total
        else:
            num_threads = cli_parsed.threads
        for i in range(num_threads):
            targets.put(None)
        try:
            workers = [
                Process(target=worker_thread,
                        args=(cli_parsed, targets, lock, (multi_counter,
                                                          multi_total)))
                for i in range(num_threads)
            ]
            for w in workers:
                w.start()
            for w in workers:
                w.join()
        except Exception as e:
            print(str(e))

        # Set up UA table here
        if cli_parsed.cycle is not None:
            ua_dict = get_ua_values(cli_parsed.cycle)
            if not cli_parsed.ua_init:
                dbm.clear_table("ua")
                completed = dbm.get_complete_http()
                completed[:] = [x for x in completed if x.error_state is None]
                for item in completed:
                    for browser, ua in ua_dict.items():
                        dbm.create_ua_object(item, browser, ua)

                cli_parsed.ua_init = True
                dbm.clear_table("opts")
                dbm.save_options(cli_parsed)

            for browser, ua in ua_dict.items():
                targets = m.Queue()
                multi_counter.value = 0
                multi_total = dbm.get_incomplete_ua(targets, browser)
                if multi_total > 0:
                    print("[*] Starting requests for User Agent {0}"
                          " ({1} Hosts)".format(browser, str(multi_total)))
                if multi_total < cli_parsed.threads:
                    num_threads = multi_total
                else:
                    num_threads = cli_parsed.threads
                for i in range(num_threads):
                    targets.put(None)
                workers = [
                    Process(target=worker_thread,
                            args=(cli_parsed, targets, lock,
                                  (multi_counter, multi_total), (browser, ua)))
                    for i in range(num_threads)
                ]
                for w in workers:
                    w.start()
                for w in workers:
                    w.join()

    if display is not None:
        display.stop()
    results = dbm.get_complete_http()
    dbm.close()
    m.shutdown()
    sort_data_and_write(cli_parsed, results)
Example #21
class AnalysisScheduler:  # pylint: disable=too-many-instance-attributes
    '''
    This Scheduler performs analysis of firmware objects
    '''
    def __init__(self,
                 config: Optional[ConfigParser] = None,
                 pre_analysis=None,
                 post_analysis=None,
                 db_interface=None):
        self.config = config
        self.analysis_plugins = {}
        self.load_plugins()
        self.stop_condition = Value('i', 0)
        self.process_queue = Queue()
        self.manager = Manager()
        self.currently_running = self.manager.dict()
        self.recently_finished = self.manager.dict()
        self.currently_running_lock = self.manager.Lock()  # pylint: disable=no-member

        self.db_backend_service = db_interface if db_interface else BackEndDbInterface(
            config=config)
        self.pre_analysis = pre_analysis if pre_analysis else self.db_backend_service.add_object
        self.post_analysis = post_analysis if post_analysis else self.db_backend_service.add_analysis
        self.start_scheduling_process()
        self.start_result_collector()
        logging.info('Analysis System online...')
        logging.info('Plugins available: {}'.format(
            self.get_list_of_available_plugins()))

    def shutdown(self):
        '''
        shutdown the scheduler and all loaded plugins
        '''
        logging.debug('Shutting down...')
        self.stop_condition.value = 1
        with ThreadPoolExecutor() as executor:
            executor.submit(self.schedule_process.join)
            executor.submit(self.result_collector_process.join)
            for plugin in self.analysis_plugins:
                executor.submit(self.analysis_plugins[plugin].shutdown)
        if getattr(self.db_backend_service, 'shutdown', False):
            self.db_backend_service.shutdown()
        self.process_queue.close()
        logging.info('Analysis System offline')

    def update_analysis_of_object_and_children(self, fo: FileObject):
        '''
        This function is used to recursively analyze an object without need of the unpacker
        '''
        for included_file in self.db_backend_service.get_list_of_all_included_files(
                fo):
            child = self.db_backend_service.get_object(included_file)
            self._schedule_analysis_tasks(child, fo.scheduled_analysis)
        self.check_further_process_or_complete(fo)

    def start_analysis_of_object(self, fo: FileObject):
        '''
        This function should be used to add a new firmware object to the scheduler
        '''
        self._add_to_current_analyses(fo)
        self._schedule_analysis_tasks(fo,
                                      fo.scheduled_analysis,
                                      mandatory=True)

    def update_analysis_of_single_object(self, fo: FileObject):
        '''
        This function is used to add analysis tasks for a single file
        '''
        self._schedule_analysis_tasks(fo, fo.scheduled_analysis)

    def _schedule_analysis_tasks(self,
                                 fo,
                                 scheduled_analysis,
                                 mandatory=False):
        scheduled_analysis = self._add_dependencies_recursively(
            copy(scheduled_analysis) or [])
        fo.scheduled_analysis = self._smart_shuffle(
            scheduled_analysis +
            MANDATORY_PLUGINS if mandatory else scheduled_analysis)
        self.check_further_process_or_complete(fo)

    def _smart_shuffle(self, plugin_list: List[str]) -> List[str]:
        scheduled_plugins = []
        remaining_plugins = set(plugin_list)

        while remaining_plugins:
            next_plugins = self._get_plugins_with_met_dependencies(
                remaining_plugins, scheduled_plugins)
            if not next_plugins:
                logging.error(
                    'Error: Could not schedule plugins because dependencies cannot be fulfilled: {}'
                    .format(remaining_plugins))
                break
            scheduled_plugins[:0] = shuffled(next_plugins)
            remaining_plugins.difference_update(next_plugins)

        # assure file type is first for blacklist functionality
        if 'file_type' in scheduled_plugins and scheduled_plugins[
                -1] != 'file_type':
            scheduled_plugins.remove('file_type')
            scheduled_plugins.append('file_type')
        return scheduled_plugins

    def _get_plugins_with_met_dependencies(
            self, remaining_plugins: Set[str],
            scheduled_plugins: List[str]) -> List[str]:
        met_dependencies = scheduled_plugins
        return [
            plugin for plugin in remaining_plugins if all(
                dependency in met_dependencies
                for dependency in self.analysis_plugins[plugin].DEPENDENCIES)
        ]

    def get_list_of_available_plugins(self):
        '''
        returns a list of all loaded plugins
        '''
        plugin_list = list(self.analysis_plugins.keys())
        plugin_list.sort(key=str.lower)
        return plugin_list

# ---- internal functions ----

    def get_default_plugins_from_config(self):
        try:
            result = {}
            for plugin_set in self.config['default_plugins']:
                result[plugin_set] = read_list_from_config(
                    self.config, 'default_plugins', plugin_set)
            return result
        except (TypeError, KeyError, AttributeError):
            logging.warning('default plug-ins not set in config')
            return []

    def get_plugin_dict(self):
        '''
        returns a dictionary of plugins with the following form: names as keys and the respective description value
        {NAME: (DESCRIPTION, mandatory, default, VERSION, DEPENDENCIES, MIME_BLACKLIST, MIME_WHITELIST, config.threads)}
        - mandatory plug-ins shall not be shown in the analysis selection but always executed
        - default plug-ins shall be pre-selected in the analysis selection
        '''
        plugin_list = self.get_list_of_available_plugins()
        plugin_list = self._remove_unwanted_plugins(plugin_list)
        default_plugins = self.get_default_plugins_from_config()
        default_flag_dict = {}
        result = {}
        for plugin in plugin_list:
            mandatory_flag = plugin in MANDATORY_PLUGINS
            for key in default_plugins:
                default_flag_dict[key] = plugin in default_plugins[key]
            blacklist, whitelist = self._get_blacklist_and_whitelist_from_plugin(
                plugin)
            result[plugin] = (self.analysis_plugins[plugin].DESCRIPTION,
                              mandatory_flag, dict(default_flag_dict),
                              self.analysis_plugins[plugin].VERSION,
                              self.analysis_plugins[plugin].DEPENDENCIES,
                              blacklist, whitelist,
                              self.config[plugin].get('threads', 0))
        result['unpacker'] = (
            'Additional information provided by the unpacker', True, False)
        return result

# ---- scheduling functions ----

    def get_scheduled_workload(self):
        self._clear_recently_finished()
        workload = {
            'analysis_main_scheduler': self.process_queue.qsize(),
            'plugins': {},
            'current_analyses': self._get_current_analyses_stats(),
            'recently_finished_analyses': dict(self.recently_finished),
        }
        for plugin_name in self.analysis_plugins:
            plugin = self.analysis_plugins[plugin_name]
            workload['plugins'][plugin_name] = {
                'queue':
                plugin.in_queue.qsize(),
                'active': (sum(plugin.active[i].value
                               for i in range(plugin.thread_count))),
            }
        return workload

    def _get_current_analyses_stats(self):
        return {
            uid: {
                'unpacked_count': stats_dict['unpacked_files_count'],
                'analyzed_count': stats_dict['analyzed_files_count'],
                'start_time': stats_dict['start_time'],
                'total_count': stats_dict['total_files_count'],
            }
            for uid, stats_dict in self.currently_running.items()
        }

    def register_plugin(self, name, plugin_instance):
        '''
        This function is called upon plugin init to announce its presence
        '''
        self.analysis_plugins[name] = plugin_instance

    def load_plugins(self):
        source = import_plugins('analysis.plugins', 'plugins/analysis')
        for plugin_name in source.list_plugins():
            plugin = source.load_plugin(plugin_name)
            plugin.AnalysisPlugin(self, config=self.config)

    def start_scheduling_process(self):
        logging.debug('Starting scheduler...')
        self.schedule_process = ExceptionSafeProcess(target=self.scheduler)
        self.schedule_process.start()

    def scheduler(self):
        while self.stop_condition.value == 0:
            try:
                task = self.process_queue.get(timeout=float(
                    self.config['ExpertSettings']['block_delay']))
            except Empty:
                pass
            else:
                self.process_next_analysis(task)

    def _reschedule_failed_analysis_task(self, fw_object: Union[Firmware,
                                                                FileObject]):
        failed_plugin, cause = fw_object.analysis_exception
        fw_object.processed_analysis[failed_plugin] = {'failed': cause}
        for plugin in fw_object.scheduled_analysis[:]:
            if failed_plugin in self.analysis_plugins[plugin].DEPENDENCIES:
                fw_object.scheduled_analysis.remove(plugin)
                logging.warning(
                    'Unscheduled analysis {} for {} because dependency {} failed'
                    .format(plugin, fw_object.uid, failed_plugin))
                fw_object.processed_analysis[plugin] = {
                    'failed':
                    'Analysis of dependency {} failed'.format(failed_plugin)
                }
        fw_object.analysis_exception = None

    # ---- analysis skipping ----

    def process_next_analysis(self, fw_object: FileObject):
        self.pre_analysis(fw_object)
        analysis_to_do = fw_object.scheduled_analysis.pop()
        if analysis_to_do not in self.analysis_plugins:
            logging.error('Plugin \'{}\' not available'.format(analysis_to_do))
            self.check_further_process_or_complete(fw_object)
        else:
            self._start_or_skip_analysis(analysis_to_do, fw_object)

    def _start_or_skip_analysis(self, analysis_to_do: str,
                                file_object: FileObject):
        if self._analysis_is_already_in_db_and_up_to_date(
                analysis_to_do, file_object.uid):
            logging.debug(
                'skipping analysis "{}" for {} (analysis already in DB)'.
                format(analysis_to_do, file_object.uid))
            if analysis_to_do in self._get_cumulative_remaining_dependencies(
                    file_object.scheduled_analysis):
                self._add_completed_analysis_results_to_file_object(
                    analysis_to_do, file_object)
            self.check_further_process_or_complete(file_object)
        elif analysis_to_do not in MANDATORY_PLUGINS and self._next_analysis_is_blacklisted(
                analysis_to_do, file_object):
            logging.debug(
                'skipping analysis "{}" for {} (blacklisted file type)'.format(
                    analysis_to_do, file_object.uid))
            file_object.processed_analysis[
                analysis_to_do] = self._get_skipped_analysis_result(
                    analysis_to_do)
            self.post_analysis(file_object)
            self.check_further_process_or_complete(file_object)
        else:
            self.analysis_plugins[analysis_to_do].add_job(file_object)

    def _add_completed_analysis_results_to_file_object(self,
                                                       analysis_to_do: str,
                                                       fw_object: FileObject):
        db_entry = self.db_backend_service.get_specific_fields_of_db_entry(
            fw_object.uid, {'processed_analysis.{}'.format(analysis_to_do): 1})
        desanitized_analysis = self.db_backend_service.retrieve_analysis(
            db_entry['processed_analysis'])
        fw_object.processed_analysis[analysis_to_do] = desanitized_analysis[
            analysis_to_do]

    def _analysis_is_already_in_db_and_up_to_date(self, analysis_to_do: str,
                                                  uid: str):
        db_entry = self.db_backend_service.get_specific_fields_of_db_entry(
            uid, {
                'processed_analysis.{plugin}.{key}'.format(
                    plugin=analysis_to_do, key=key): 1
                for key in [
                    'failed', 'file_system_flag', 'plugin_version',
                    'system_version'
                ]
            })
        if not db_entry or analysis_to_do not in db_entry[
                'processed_analysis'] or 'failed' in db_entry[
                    'processed_analysis'][analysis_to_do]:
            return False
        if 'plugin_version' not in db_entry['processed_analysis'][
                analysis_to_do]:
            logging.error('Plugin Version missing: UID: {}, Plugin: {}'.format(
                uid, analysis_to_do))
            return False

        if db_entry['processed_analysis'][analysis_to_do]['file_system_flag']:
            db_entry[
                'processed_analysis'] = self.db_backend_service.retrieve_analysis(
                    db_entry['processed_analysis'],
                    analysis_filter=[analysis_to_do])
            if 'file_system_flag' in db_entry['processed_analysis'][
                    analysis_to_do]:
                logging.warning('Desanitization of version string failed')
                return False

        return self._analysis_is_up_to_date(
            db_entry['processed_analysis'][analysis_to_do],
            self.analysis_plugins[analysis_to_do])

    @staticmethod
    def _analysis_is_up_to_date(analysis_db_entry: dict,
                                analysis_plugin: AnalysisBasePlugin):
        old_plugin_version = analysis_db_entry['plugin_version']
        old_system_version = analysis_db_entry.get('system_version', None)
        current_plugin_version = analysis_plugin.VERSION
        current_system_version = getattr(analysis_plugin, 'SYSTEM_VERSION',
                                         None)
        try:
            if LooseVersion(old_plugin_version) < LooseVersion(current_plugin_version) or \
                    LooseVersion(old_system_version or '0') < LooseVersion(current_system_version or '0'):
                return False
        except TypeError:
            logging.error(
                'plug-in or system version of "{}" plug-in is or was invalid!'.
                format(analysis_plugin.NAME))
            return False
        return True

# ---- blacklist and whitelist ----

    def _get_skipped_analysis_result(self, analysis_to_do):
        return {
            'skipped': 'blacklisted file type',
            'summary': [],
            'analysis_date': time(),
            'plugin_version': self.analysis_plugins[analysis_to_do].VERSION
        }

    def _next_analysis_is_blacklisted(self, next_analysis: str,
                                      fw_object: FileObject):
        blacklist, whitelist = self._get_blacklist_and_whitelist(next_analysis)
        if not (blacklist or whitelist):
            return False
        if blacklist and whitelist:
            message = color_string(
                'Configuration of plugin "{}" erroneous'.format(next_analysis),
                TerminalColors.FAIL)
            logging.error(
                '{}: found blacklist and whitelist. Ignoring blacklist.'.
                format(message))

        file_type = self._get_file_type_from_object_or_db(fw_object)

        if whitelist:
            return not substring_is_in_list(file_type, whitelist)
        return substring_is_in_list(file_type, blacklist)

    def _get_file_type_from_object_or_db(
            self, fw_object: FileObject) -> Optional[str]:
        if 'file_type' not in fw_object.processed_analysis:
            self._add_completed_analysis_results_to_file_object(
                'file_type', fw_object)

        return fw_object.processed_analysis['file_type']['mime'].lower()

    def _get_blacklist_and_whitelist(self,
                                     next_analysis: str) -> Tuple[List, List]:
        blacklist, whitelist = self._get_blacklist_and_whitelist_from_config(
            next_analysis)
        if not (blacklist or whitelist):
            blacklist, whitelist = self._get_blacklist_and_whitelist_from_plugin(
                next_analysis)
        return blacklist, whitelist

    def _get_blacklist_and_whitelist_from_config(
            self, analysis_plugin: str) -> Tuple[List, List]:
        blacklist = read_list_from_config(self.config, analysis_plugin,
                                          'mime_blacklist')
        whitelist = read_list_from_config(self.config, analysis_plugin,
                                          'mime_whitelist')
        return blacklist, whitelist

    def _get_blacklist_and_whitelist_from_plugin(
            self, analysis_plugin: str) -> Tuple[List, List]:
        blacklist = getattr(self.analysis_plugins[analysis_plugin],
                            'MIME_BLACKLIST', [])
        whitelist = getattr(self.analysis_plugins[analysis_plugin],
                            'MIME_WHITELIST', [])
        return blacklist, whitelist

# ---- result collector functions ----

    def start_result_collector(self):
        logging.debug('Starting result collector')
        self.result_collector_process = ExceptionSafeProcess(
            target=self.result_collector)
        self.result_collector_process.start()


    def result_collector(self):  # pylint: disable=too-complex
        while self.stop_condition.value == 0:
            nop = True
            for plugin in self.analysis_plugins:
                try:
                    fw = self.analysis_plugins[plugin].out_queue.get_nowait()
                except Empty:
                    pass
                else:
                    nop = False
                    if plugin in fw.processed_analysis:
                        if fw.analysis_exception:
                            self._reschedule_failed_analysis_task(fw)

                        self.post_analysis(fw)
                    self.check_further_process_or_complete(fw)
            if nop:
                sleep(float(self.config['ExpertSettings']['block_delay']))

    def check_further_process_or_complete(self, fw_object):
        if not fw_object.scheduled_analysis:
            logging.info('Analysis Completed:\n{}'.format(fw_object))
            self._remove_from_current_analyses(fw_object)
        else:
            self.process_queue.put(fw_object)

# ---- miscellaneous functions ----

    @staticmethod
    def _remove_unwanted_plugins(list_of_plugins):
        defaults = ['dummy_plugin_for_testing_only']
        for plugin in defaults:
            list_of_plugins.remove(plugin)
        return list_of_plugins

    def check_exceptions(self):
        for _, plugin in self.analysis_plugins.items():
            if plugin.check_exceptions():
                return True
        return check_worker_exceptions(
            [self.schedule_process, self.result_collector_process],
            'Scheduler')

    def _add_dependencies_recursively(
            self, scheduled_analyses: List[str]) -> List[str]:
        scheduled_analyses_set = set(scheduled_analyses)
        while True:
            new_dependencies = self._get_cumulative_remaining_dependencies(
                scheduled_analyses_set)
            if not new_dependencies:
                break
            scheduled_analyses_set.update(new_dependencies)
        return list(scheduled_analyses_set)

    def _get_cumulative_remaining_dependencies(
            self, scheduled_analyses: Set[str]) -> Set[str]:
        return {
            dependency
            for plugin in scheduled_analyses
            for dependency in self.analysis_plugins[plugin].DEPENDENCIES
        }.difference(scheduled_analyses)

    # currently running analyses

    def _add_to_current_analyses(self, fw_object: Union[Firmware, FileObject]):
        self.currently_running_lock.acquire()
        try:
            if isinstance(fw_object, Firmware):
                self.currently_running[
                    fw_object.uid] = self._init_current_analysis(fw_object)
            else:
                self._update_current_analysis(fw_object)
        finally:
            self.currently_running_lock.release()

    def _update_current_analysis(self, fw_object):
        '''
        new file comes from unpacking:
        - file moved from files_to_unpack to files_to_analyze (could be duplicate!)
        - included files added to files_to_unpack (could also include duplicates!)
        '''
        for parent in self._find_currently_analyzed_parents(fw_object):
            updated_dict = self.currently_running[parent]
            new_files = set(fw_object.files_included) - set(
                updated_dict['files_to_unpack']).union(
                    set(updated_dict['files_to_analyze']))
            updated_dict['total_files_count'] += len(new_files)
            updated_dict['files_to_unpack'] = list(
                set(updated_dict['files_to_unpack']).union(new_files))
            if fw_object.uid in updated_dict['files_to_unpack']:
                updated_dict['files_to_unpack'].remove(fw_object.uid)
                updated_dict['files_to_analyze'].append(fw_object.uid)
                updated_dict['unpacked_files_count'] += 1
            self.currently_running[parent] = updated_dict

    @staticmethod
    def _init_current_analysis(fw_object):
        return {
            'files_to_unpack': list(fw_object.files_included),
            'files_to_analyze': [fw_object.uid],
            'start_time': time(),
            'unpacked_files_count': 1,
            'analyzed_files_count': 0,
            'total_files_count': 1 + len(fw_object.files_included),
        }

    def _remove_from_current_analyses(self, fw_object: Union[Firmware,
                                                             FileObject]):
        try:
            self.currently_running_lock.acquire()
            for parent in self._find_currently_analyzed_parents(fw_object):
                updated_dict = self.currently_running[parent]
                if fw_object.uid not in updated_dict['files_to_analyze']:
                    logging.warning(
                        'Trying to remove {} from current analysis of {} but it is not included'
                        .format(fw_object.uid, parent))
                    continue
                updated_dict['files_to_analyze'] = list(
                    set(updated_dict['files_to_analyze']) - {fw_object.uid})
                updated_dict['analyzed_files_count'] += 1
                if len(updated_dict['files_to_unpack']) == len(
                        updated_dict['files_to_analyze']) == 0:
                    self.recently_finished[
                        parent] = self._init_recently_finished(updated_dict)
                    self.currently_running.pop(parent)
                    logging.info(
                        'Analysis of firmware {} completed'.format(parent))
                else:
                    self.currently_running[parent] = updated_dict
        finally:
            self.currently_running_lock.release()

    @staticmethod
    def _init_recently_finished(analysis_data: dict) -> dict:
        return {
            'duration': time() - analysis_data['start_time'],
            'total_files_count': analysis_data['total_files_count'],
            'time_finished': time(),
        }

    def _find_currently_analyzed_parents(
            self, fw_object: Union[Firmware, FileObject]) -> Set[str]:
        parent_uids = {fw_object.uid} if isinstance(
            fw_object, Firmware) else fw_object.parent_firmware_uids
        return set(self.currently_running.keys()).intersection(parent_uids)

    def _clear_recently_finished(self):
        for uid, stats in list(self.recently_finished.items()):
            if time(
            ) - stats['time_finished'] > RECENTLY_FINISHED_DISPLAY_TIME_IN_SEC:
                self.recently_finished.pop(uid)
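
The progress bookkeeping above copies each entry out of currently_running, mutates the copy, and assigns it back while holding currently_running_lock. That is the usual pattern for a Manager dict proxy: changes made to a value nested inside the proxy are not written back automatically, so the whole entry has to be reassigned. A minimal, self-contained sketch of that pattern (the names progress and add_file are illustrative, not taken from the scheduler above):

from multiprocessing import Manager, Process

def add_file(progress, lock, parent_uid, file_uid):
    # Copy the entry out of the proxy, change the plain dict locally, then
    # write the whole entry back; nested mutations do not reach the manager.
    with lock:
        entry = progress[parent_uid]
        entry['files_to_analyze'].append(file_uid)
        entry['total_files_count'] += 1
        progress[parent_uid] = entry

if __name__ == '__main__':
    manager = Manager()
    progress = manager.dict()
    lock = manager.Lock()
    progress['fw1'] = {'files_to_analyze': [], 'total_files_count': 0}
    workers = [Process(target=add_file, args=(progress, lock, 'fw1', 'file_%d' % i))
               for i in range(4)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
    print(progress['fw1'])  # all four files recorded, count == 4
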
Example #22
0
class TestManager:
    """
    Manages the testers and helps them execute their steps in synchronized order.
    Each tester is executed in a separate process.
    """
    def __init__(self):
        self.manager = Manager()
        self.lock = self.manager.Lock()
        self.process_done = self.manager.Semaphore(0)
        self.queue = self.manager.Queue()
        self.sub_proc = self.manager.Queue()
        self._setup()

    def _setup(self):
        self.testers = []
        self.next_steps = []
        self.proc_ids = []
        self.subprocToKill = []

    def add_tester(self, tester):
        self.testers.append(tester)

    def start_processes(self, rand_sleep):
        """create process for each tester"""
        self.pids = self.manager.Array('l', range(len(self.testers)))

        for id in range(len(self.testers)):
            self.process_done.release()
            next_s = self.manager.Semaphore(0)

            p = Process(target=self.testers[id].run,
                        args=(self.process_done, next_s, rand_sleep, self.lock,
                              self.sub_proc, self.pids, id, self.queue))
            self.proc_ids.append(p)
            self.next_steps.append(next_s)
            p.start()
            self.pids[id] = p.pid

    def wait_for_processes(self):
        """wait for all process to finish"""
        for p in self.proc_ids:
            p.join()
            p.terminate()

        self.lock.acquire()
        print("end")
        self.lock.release()

    def run(self, rand_sleep=True):
        """Execute tester steps"""
        self.start_processes(rand_sleep)

        step = -1
        will_continue = list(range(len(self.next_steps)))
        wait_for = list(range(len(self.next_steps)))
        while True:
            if step >= 0:
                print("\n\n=================== TestManager step",
                      step,
                      "testers:",
                      wait_for,
                      file=sys.stderr)
            for _ in wait_for:
                self.process_done.acquire()
                if step >= 0:
                    proc, name, status = self.queue.get()
                    print(("Received ", proc, name, status), file=sys.stderr)
                    if status == True:
                        will_continue.append(proc)
                    elif isinstance(status, BaseException):
                        print("Error in tester", proc, name, "step", step)
                        for p in self.proc_ids:
                            p.terminate()
                        while not self.sub_proc.empty():
                            pid = self.sub_proc.get()
                            try:
                                os.kill(pid, signal.SIGKILL)
                            except OSError:
                                # the subprocess may already be gone
                                pass
                        raise status

            if len(will_continue) == 0:
                break

            for id in will_continue:
                self.next_steps[id].release()

            wait_for = will_continue[:]
            will_continue = []
            step += 1

        self.wait_for_processes()
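
The class docstring above promises synchronized step execution; the mechanism is one shared process_done semaphore that every tester releases when it finishes a step, plus one per-tester semaphore that the coordinator releases to let all of them advance together. A stripped-down sketch of that handshake, with illustrative names and without the error handling or result queue:

from multiprocessing import Manager, Process

def worker(worker_id, done, next_step, steps):
    for step in range(steps):
        print('worker', worker_id, 'finished step', step)
        done.release()       # report this step as done
        next_step.acquire()  # block until the coordinator lets everyone continue

if __name__ == '__main__':
    manager = Manager()
    done = manager.Semaphore(0)
    steps, n_workers = 3, 2
    next_steps = [manager.Semaphore(0) for _ in range(n_workers)]
    procs = [Process(target=worker, args=(i, done, next_steps[i], steps))
             for i in range(n_workers)]
    for p in procs:
        p.start()
    for _ in range(steps):
        for _ in range(n_workers):
            done.acquire()   # wait until every worker has reported in
        for sem in next_steps:
            sem.release()    # release all workers for the next step
    for p in procs:
        p.join()
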
Example #23
0
def continue_shadowing(
        run, parameter, checkpoint,
        num_segments, steps_per_segment, epsilon=1E-6,
        checkpoint_path=None, checkpoint_interval=1, simultaneous_runs=None,
        run_ddt=None, return_checkpoint=False, get_host_dir=None, spawn_compute_job=None):
    """
    """
    compute_outputs = []

    run = RunWrapper(run)
    assert verify_checkpoint(checkpoint)
    u0, V, v, lss, G_lss, g_lss, J_hist, G_dil, g_dil = checkpoint

    manager = Manager()
    interprocess = (manager.Lock(), manager.dict())

    i = lss.K_segments()
    run_id = 'time_dilation_{0:02d}'.format(i)
    if run_ddt is not None:
        time_dil = TimeDilationExact(run_ddt, u0, parameter)
    else:
        time_dil = TimeDilation(run, u0, parameter, run_id,
                                simultaneous_runs, interprocess)

    # V = time_dil.project(V)
    # v = time_dil.project(v)

    u0, V, v, J0, G, g = run_segment(
            run, u0, V, v, parameter, i, steps_per_segment,
            epsilon, simultaneous_runs, interprocess, get_host_dir=get_host_dir,
            compute_outputs=compute_outputs, spawn_compute_job=spawn_compute_job)

    J_hist.append(J0)
    G_lss.append(G)
    g_lss.append(g)

    for i in range(lss.K_segments() + 1, num_segments + 1):

        # time dilation contribution
        run_id = 'time_dilation_{0:02d}'.format(i)
        if run_ddt is not None:
            time_dil = TimeDilationExact(run_ddt, u0, parameter)
        else:
            time_dil = TimeDilation(run, u0, parameter, run_id,
                                    simultaneous_runs, interprocess)
        G_dil.append(time_dil.contribution(V))
        g_dil.append(time_dil.contribution(v))

        # V = time_dil.project(V)
        # v = time_dil.project(v)

        V, v = lss.checkpoint(V, v)
        # extra outputs to compute
        compute_outputs = [lss.Rs[-1], lss.bs[-1], G_dil[-1], g_dil[-1]]

        # run all segments
        if i < num_segments:
            u0, V, v, J0, G, g = run_segment(
                    run, u0, V, v, parameter, i, steps_per_segment,
                    epsilon, simultaneous_runs, interprocess, get_host_dir=get_host_dir,
                    compute_outputs=compute_outputs, spawn_compute_job=spawn_compute_job)
        else:
            run_compute(compute_outputs, spawn_compute_job=spawn_compute_job, interprocess=interprocess)

        for output in [lss.Rs, lss.bs, G_dil, g_dil]:
            output[-1] = output[-1].field

        checkpoint = Checkpoint(
                u0, V, v, lss, G_lss, g_lss, J_hist, G_dil, g_dil)
        print(lss_gradient(checkpoint))
        sys.stdout.flush()

        if checkpoint_path and i % checkpoint_interval == 0:
            save_checkpoint(checkpoint_path, checkpoint)

        if i < num_segments:
            J_hist.append(J0)
            G_lss.append(G)
            g_lss.append(g)

    if return_checkpoint:
        return checkpoint
    else:
        G = lss_gradient(checkpoint)
        return np.array(J_hist).mean((0,1)), G
Example #24
0
class SeldonMetrics:
    """Class to manage custom metrics stored in shared memory."""
    def __init__(self, worker_id_func=os.getpid):
        # We keep reference to Manager so it does not get garbage collected
        self._manager = Manager()
        self._lock = self._manager.Lock()
        self.data = self._manager.dict()
        self.worker_id_func = worker_id_func

    def __del__(self):
        self._manager.shutdown()

    def update_reward(self, reward: float):
        """"Update metrics key corresponding to feedback reward counter."""
        if not reward or legacy_mode:
            return
        self.update(
            [{
                "type": "COUNTER",
                "key": FEEDBACK_KEY,
                "value": 1
            }],
            FEEDBACK_METRIC_METHOD_TAG,
        )
        self.update(
            [{
                "type": "COUNTER",
                "key": FEEDBACK_REWARD_KEY,
                "value": reward
            }],
            FEEDBACK_METRIC_METHOD_TAG,
        )

    def update(self, custom_metrics: List[Dict], method: str):
        # Read a corresponding worker's metric data with lock as Proxy objects
        # are not thread-safe, see "Thread safety of proxies" here
        # https://docs.python.org/3.7/library/multiprocessing.html#programming-guidelines
        logger.debug("Updating metrics: {}".format(custom_metrics))
        with self._lock:
            data = self.data.get(self.worker_id_func(), {})
        logger.debug("Read current metrics data from shared memory")

        for metrics in custom_metrics:
            metrics_type = metrics.get("type", "COUNTER")
            key = metrics_type, metrics["key"]
            tags = metrics.get("tags", {})
            # Add tag that specifies which method added the metrics
            tags["method"] = method
            if metrics_type == "COUNTER":
                value = data.get(key, {}).get("value", 0)
                data[key] = {"value": value + metrics["value"], "tags": tags}
            elif metrics_type == "TIMER":
                vals, sumv = data.get(key, {}).get(
                    "value", (list(np.zeros(len(BINS) - 1)), 0))
                # Dividing by 1000 because unit is milliseconds
                data[key] = {
                    "value": self._update_hist(metrics["value"] / 1000, vals,
                                               sumv),
                    "tags": tags,
                }
            elif metrics_type == "GAUGE":
                data[key] = {"value": metrics["value"], "tags": tags}
            else:
                logger.error(f"Unkown metrics type: {metrics_type}")

        # Write worker's data with lock (again - Proxy objects are not thread-safe)
        with self._lock:
            self.data[self.worker_id_func()] = data
        logger.debug("Updated metrics in the shared memory.")

    def collect(self):
        # Read all workers metrics with lock to avoid other processes / threads
        # writing to it at the same time. Casting to `dict` works like reading of data.
        logger.debug("SeldonMetrics.collect called")
        with self._lock:
            data = dict(self.data)
        logger.debug("Read current metrics data from shared memory")

        for worker, metrics in data.items():
            for (item_type, item_name), item in metrics.items():
                labels_keys, labels_values = self._merge_labels(
                    str(worker), item["tags"])
                if item_type == "GAUGE":
                    yield self._expose_gauge(item_name, item["value"],
                                             labels_keys, labels_values)
                elif item_type == "COUNTER":
                    yield self._expose_counter(item_name, item["value"],
                                               labels_keys, labels_values)
                elif item_type == "TIMER":
                    yield self._expose_histogram(item_name, item["value"],
                                                 labels_keys, labels_values)

    def generate_metrics(self):
        myregistry = CollectorRegistry()
        myregistry.register(self)
        return (
            exposition.generate_latest(myregistry).decode("utf-8"),
            exposition.CONTENT_TYPE_LATEST,
        )

    @staticmethod
    def _merge_labels(worker, tags):
        labels = {**tags, **DEFAULT_LABELS, "worker_id": str(worker)}
        return list(labels.keys()), list(labels.values())

    @staticmethod
    def _update_hist(x, vals, sumv):
        hist = np.histogram([x], BINS)[0]
        vals = list(np.array(vals) + hist)
        return vals, sumv + x

    @staticmethod
    def _expose_gauge(name, value, labels_keys, labels_values):
        metric = GaugeMetricFamily(name, "", labels=labels_keys)
        metric.add_metric(labels_values, value)
        return metric

    @staticmethod
    def _expose_counter(name, value, labels_keys, labels_values):
        metric = CounterMetricFamily(name, "", labels=labels_keys)
        metric.add_metric(labels_values, value)
        return metric

    @staticmethod
    def _expose_histogram(name, value, labels_keys, labels_values):
        vals, sumv = value
        buckets = [[floatToGoString(b), v]
                   for v, b in zip(np.cumsum(vals), BINS[1:])]

        metric = HistogramMetricFamily(name, "", labels=labels_keys)
        metric.add_metric(labels_values, buckets, sum_value=sumv)
        return metric
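
The comments in update() and collect() spell out the locking rule used here: hold the Manager lock only while reading or writing the dict proxy, and do the actual aggregation on a plain local copy. A reduced sketch of that read-copy-write-back pattern (the bump_counter helper and its key are made up for illustration):

import os
from multiprocessing import Manager

def bump_counter(shared, lock, key, amount):
    worker_id = os.getpid()
    with lock:
        data = shared.get(worker_id, {})   # read the proxy under the lock
    data[key] = data.get(key, 0) + amount  # update a plain local dict, lock-free
    with lock:
        shared[worker_id] = data           # write the worker's slice back under the lock

if __name__ == '__main__':
    manager = Manager()
    lock = manager.Lock()
    shared = manager.dict()
    bump_counter(shared, lock, 'requests_total', 1)
    bump_counter(shared, lock, 'requests_total', 1)
    print(dict(shared))                    # {<pid>: {'requests_total': 2}}
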
Example #25
0
async def schedule_formatting(
    sources: Set[Path],
    fast: bool,
    write_back: WriteBack,
    mode: Mode,
    report: "Report",
    loop: asyncio.AbstractEventLoop,
    executor: Executor,
) -> None:
    """Run formatting of `sources` in parallel using the provided `executor`.

    (Use ProcessPoolExecutors for actual parallelism.)

    `write_back`, `fast`, and `mode` options are passed to
    :func:`format_file_in_place`.
    """
    cache: Cache = {}
    if write_back not in (WriteBack.DIFF, WriteBack.COLOR_DIFF):
        cache = read_cache(mode)
        sources, cached = filter_cached(cache, sources)
        for src in sorted(cached):
            report.done(src, Changed.CACHED)
    if not sources:
        return

    cancelled = []
    sources_to_cache = []
    lock = None
    if write_back in (WriteBack.DIFF, WriteBack.COLOR_DIFF):
        # For diff output, we need locks to ensure we don't interleave output
        # from different processes.
        manager = Manager()
        lock = manager.Lock()
    tasks = {
        asyncio.ensure_future(
            loop.run_in_executor(executor, format_file_in_place, src, fast,
                                 mode, write_back, lock)): src
        for src in sorted(sources)
    }
    pending = tasks.keys()
    try:
        loop.add_signal_handler(signal.SIGINT, cancel, pending)
        loop.add_signal_handler(signal.SIGTERM, cancel, pending)
    except NotImplementedError:
        # There are no good alternatives for these on Windows.
        pass
    while pending:
        done, _ = await asyncio.wait(pending,
                                     return_when=asyncio.FIRST_COMPLETED)
        for task in done:
            src = tasks.pop(task)
            if task.cancelled():
                cancelled.append(task)
            elif task.exception():
                report.failed(src, str(task.exception()))
            else:
                changed = Changed.YES if task.result() else Changed.NO
                # If the file was written back or was successfully checked as
                # well-formatted, store this information in the cache.
                if write_back is WriteBack.YES or (
                        write_back is WriteBack.CHECK
                        and changed is Changed.NO):
                    sources_to_cache.append(src)
                report.done(src, changed)
    if cancelled:
        if sys.version_info >= (3, 7):
            await asyncio.gather(*cancelled, return_exceptions=True)
        else:
            await asyncio.gather(*cancelled, loop=loop, return_exceptions=True)
    if sources_to_cache:
        write_cache(cache, sources_to_cache, mode)
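
The in-code comment explains why the lock exists: diff output from different worker processes must not interleave. Using Manager().Lock() rather than a bare multiprocessing.Lock also has a practical advantage here: the lock proxy can be passed to ProcessPoolExecutor workers as an ordinary argument, whereas a bare Lock refuses to be pickled that way. A small sketch of that usage (the report worker below is illustrative, not format_file_in_place):

from concurrent.futures import ProcessPoolExecutor
from multiprocessing import Manager

def report(path, lock):
    with lock:  # keep each worker's two output lines together
        print('diff for', path)
        print('end of diff for', path)

if __name__ == '__main__':
    manager = Manager()
    lock = manager.Lock()
    with ProcessPoolExecutor(max_workers=2) as executor:
        for path in ['a.py', 'b.py', 'c.py']:
            executor.submit(report, path, lock)
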
Example #26
0
class BaseScraper(ABC, Logger):
    """Base scraper object, for scraping and filtering Jobs from a provider
    """

    def __init__(self, session: Session, config: 'JobFunnelConfigManager',
                 job_filter: JobFilter) -> None:
        """Init

        Args:
            session (Session): session object used to make post and get requests
            config (JobFunnelConfigManager): config containing all needed paths,
                search proxy, delaying and other metadata.
            job_filter (JobFilter): object for filtering incoming jobs using
                various internal filters, including a content-matching tool.
                NOTE: this runs on-the-fly as well, and preempts un-promising
                job scrapes to minimize session() usage.

        Raises:
            ValueError: if no Locale is configured in the JobFunnelConfigManager
        """
        # Inits
        super().__init__(level=config.log_level, file_path=config.log_file)
        self.job_filter = job_filter
        self.session = session
        self.config = config
        if self.headers:
            self.session.headers.update(self.headers)

        # Elongate the retries TODO: make configurable
        retry = Retry(connect=3, backoff_factor=0.5)
        adapter = HTTPAdapter(max_retries=retry)
        self.session.mount('http://', adapter)
        self.session.mount('https://', adapter)

        # Ensure that the locale we want to use matches the locale that the
        # scraper was written to scrape in:
        if self.config.search_config.locale != self.locale:
            raise ValueError(
                f"Attempting to use scraper designed for {self.locale.name} "
                "when config indicates user is searching with "
                f"{self.config.search_config.locale.name}"
            )

        # Ensure our properties satisfy constraints
        self._validate_get_set()
        self.thread_manager = Manager()

        # Construct actions list which respects priority for scraping Jobs
        self._actions_list = [(True, f) for f in self.job_get_fields]
        self._actions_list += [(False, f) for f in self.job_set_fields if f
                               in self.high_priority_get_set_fields]
        self._actions_list += [(False, f) for f in self.job_set_fields if f not
                               in self.high_priority_get_set_fields]

    @property
    def user_agent(self) -> str:
        """Get a randomized user agent for this scraper
        """
        return random.choice(USER_AGENT_LIST)

    @property
    def job_init_kwargs(self) -> Dict[JobField, Any]:
        """This is a helper property that stores a Dict of JobField : value that
        we set defaults for when scraping. If the scraper fails to get/set these
        we can fail back to the empty value from here.

        i.e. JobField.POST_DATE defaults to today.
        TODO: formalize the defaults for JobFields via Job.__init__(Jobfields...
        """
        return {
            JobField.STATUS: JobStatus.NEW,
            JobField.LOCALE: self.locale,
            JobField.QUERY: self.config.search_config.query_string,
            JobField.DESCRIPTION: '',
            JobField.URL: '',
            JobField.SHORT_DESCRIPTION: '',
            JobField.RAW: None,
            JobField.PROVIDER: self.__class__.__name__,
            JobField.REMOTENESS: Remoteness.UNKNOWN,
            JobField.WAGE: '',
        }

    @property
    def min_required_job_fields(self) -> List[JobField]:
        """If we dont get() or set() any of these fields, we will raise an
        exception instead of continuing without that information.

        NOTE: pointless to check for locale / provider / other defaults

        Override if needed, but be aware that key_id should always be populated
        along with URL or the user can do nothing with the result.
        """
        return [
            JobField.TITLE, JobField.COMPANY, JobField.LOCATION,
            JobField.KEY_ID, JobField.URL
        ]

    @property
    def high_priority_get_set_fields(self) -> List[JobField]:
        """These get() and/or set() fields will be populated first.

        i.e we need the RAW populated before DESCRIPTION, so RAW should be high.
        i.e. we need to get key_id before we set job.url, so key_id is high.

        NOTE: override as needed.
        """
        return []

    @property
    @abstractmethod
    def job_get_fields(self) -> List[JobField]:
        """Call self.get(...) for the JobFields in this list when scraping a Job.

        NOTE: these will be passed job listing soups, if you have data you need
        to populate that exists in the Job.RAW (the soup from the listing's own
        page), you should use job_set_fields.
        """

    @property
    @abstractmethod
    def job_set_fields(self) -> List[JobField]:
        """Call self.set(...) for the JobFields in this list when scraping a Job

        NOTE: You should generally set the job's own page as soup to RAW first
        and then populate other fields from this soup, or from each-other here.
        """

    @property
    @abstractmethod
    def delayed_get_set_fields(self) -> List[JobField]:
        """Delay execution when getting /setting any of these attributes of a
        job.

        TODO: handle this within an overridden self.session.get()
        """

    @property
    @abstractmethod
    def locale(self) -> Locale:
        """The localization that this scraper was built for.

        i.e. I am looking for jobs on the Canadian version of Indeed, and I
        speak english, so I will have this return Locale.CANADA_ENGLISH

        We will use this to put the right filters & scrapers together

        NOTE: it is best to inherit this from Base<Locale>Class (btm. of file)
        """

    @property
    @abstractmethod
    def headers(self) -> Dict[str, str]:
        """The Session headers for this scraper to be used with
        requests.Session.headers.update()
        """

    def scrape(self) -> Dict[str, Job]:
        """Scrape job source into a dict of unique jobs keyed by ID

        Returns:
            jobs (Dict[str, Job]): list of Jobs in a Dict keyed by job.key_id
        """

        # Get a list of job soups from the initial search results page
        # These won't contain enough information to do more than initialize Job
        try:
            job_soups = self.get_job_soups_from_search_result_listings()
        except Exception as err:
            raise ValueError(
                "Unable to extract jobs from initial search result page:\n\t"
                f"{str(err)}"
            )
        n_soups = len(job_soups)
        self.logger.info(
            "Scraped %s job listings from search results pages", n_soups
        )

        # Init a Manager so we can control delaying
        # this is assuming every job will incur one delayed session.get()
        # NOTE pylint issue: https://github.com/PyCQA/pylint/issues/3313
        delay_lock = self.thread_manager.Lock()  # pylint: disable=no-member
        threads = ThreadPoolExecutor(max_workers=MAX_CPU_WORKERS)

        # Distribute work to N workers such that each worker is building one
        # Job at a time, getting and setting all required attributes
        jobs_dict = {}  # type: Dict[str, Job]
        try:
            # Calculate delays for get/set calls per-job NOTE: only get/set
            # calls in self.delayed_get_set_fields will be delayed.
            # and it busy-waits.
            delays = calculate_delays(n_soups, self.config.delay_config)
            futures = []
            for job_soup, delay in zip(job_soups, delays):
                futures.append(
                    threads.submit(
                        self.scrape_job,
                        job_soup=job_soup,
                        delay=delay,
                        delay_lock=delay_lock,
                    )
                )

            # For each job-soup object, scrape the soup into a Job (w/o desc.)
            for future in tqdm(as_completed(futures), total=n_soups):
                job = future.result()
                if job:
                    # Handle inter-scraped data duplicates by key.
                    # TODO: move this functionality into duplicates filter
                    if job.key_id in jobs_dict:
                        self.logger.error(
                            "Job %s and %s share duplicate key_id: %s",
                            job.title, jobs_dict[job.key_id].title, job.key_id
                        )
                    else:
                        jobs_dict[job.key_id] = job

        finally:
            # Cleanup
            threads.shutdown()

        return jobs_dict

    # pylint: disable=no-member
    def scrape_job(self, job_soup: BeautifulSoup, delay: float,
                   delay_lock: Optional[Lock] = None) -> Optional[Job]:
        """Scrapes a search page and get a list of soups that will yield jobs
        Arguments:
            job_soup (BeautifulSoup): This is a soup object that your get/set
                will use to perform the get/set action. It should be specific
                to this job and not contain other job information.
            delay (float): how long to delay getting/setting for certain
                get/set calls while scraping data for this job.
            delay_lock (Optional[Manager.Lock], optional): semaphore for
                synchronizing respectful delaying across workers

        NOTE: this will never raise an exception to prevent killing workers,
            who are building jobs sequentially.

        Returns:
            Optional[Job]: job object constructed from the soup and localization
                of class, returns None if scrape failed.
        """
        # Scrape the data for the post, requiring a minimum of info...
        # NOTE: if we perform a self.session.get we may get respectfully delayed
        job = None  # type: Optional[Job]
        job_init_kwargs = self.job_init_kwargs  # NOTE: faster?
        for is_get, field in self._actions_list:

            # Break out immediately because we have failed a filterable
            # condition with something we initialized while scraping.
            if job and self.job_filter.filterable(job):
                if self.job_filter.is_duplicate(job):
                    # NOTE: if we pre-empt scraping duplicates we cannot update
                    # the existing job listing with the new information!
                    # TODO: make this behaviour configurable? ('minimal-get' ?)
                    self.logger.debug(
                        "Scraped job %s has key_id in known duplicates list. "
                        "Continuing scrape of job to update existing job "
                        "attributes.",
                        job.key_id
                    )
                else:
                    self.logger.debug(
                        "Cancelled scraping of %s, failed JobFilter",
                        job.key_id
                    )
                    break

            # Respectfully delay if it's configured to do so.
            if field in self.delayed_get_set_fields:
                if delay_lock:
                    self.logger.debug("Delaying for %.4f", delay)
                    with delay_lock:
                        sleep(delay)
                else:
                    sleep(delay)

            try:
                if is_get:
                    job_init_kwargs[field] = self.get(field, job_soup)
                else:
                    if not job:
                        # Build initial job object + populate all the job
                        job = Job(**{
                            k.name.lower(): v for k, v
                            in job_init_kwargs.items()
                        })
                    self.set(field, job, job_soup)

            except Exception as err:

                # TODO: we should really dump the soup object to an XML file
                # so that users encountering bugs can submit it and we can
                # quickly fix any failing scraping.

                if field in self.min_required_job_fields:
                    raise ValueError(
                        "Unable to scrape minimum-required job field: "
                        f"{field.name} Got error:{str(err)}. {job.url}"
                    )
                else:
                    # Crash out gracefully so we can continue scraping.
                    self.logger.warning(
                        "Unable to scrape %s for job: %s. %s",
                        field.name.lower(),
                        err,
                        job.url,
                    )

        # Validate job fields if we got something
        if job:
            try:
                job.validate()
            except Exception as err:
                # Bad job scrapes can't take down execution!
                # NOTE: desc too short etc, usually indicates that the job
                # is an empty page. Not sure why this comes up once in a while...
                self.logger.error("Job failed validation: %s", err)
                return None

        return job
    # pylint: enable=no-member

    @abstractmethod
    def get_job_soups_from_search_result_listings(self) -> List[BeautifulSoup]:
        """Scrapes a job provider's response to a search query where we are
        shown many job listings at once.

        NOTE: the soups list returned by this method should contain enough
        information to set your self.min_required_job_fields with get()

        Returns:
            List[BeautifulSoup]: list of jobs soups we can use to make a Job
        """

    @abstractmethod
    def get(self, parameter: JobField, soup: BeautifulSoup) -> Any:
        """Get a single job attribute from a soup object by JobField

        i.e. if param is JobField.COMPANY --> scrape from soup --> return str
        TODO: better way to handle ret type?
        """

    @abstractmethod
    def set(self, parameter: JobField, job: Job, soup: BeautifulSoup) -> None:
        """Set a single job attribute from a soup object by JobField

        Use this to set Job attribs that rely on Job existing already
        with the required minimum fields.

        i.e. I can set() the Job.RAW to be the soup of its own dedicated web
        page (Job.URL), then I can set() my Job.DESCRIPTION from the Job.RAW
        """

    def _validate_get_set(self) -> None:
        """Ensure the get/set actions cover all need attribs and dont intersect
        """
        set_job_get_fields = set(self.job_get_fields)
        set_job_set_fields = set(self.job_set_fields)
        all_set_get_fields = set(self.job_get_fields + self.job_set_fields)
        set_min_fields = set(self.min_required_job_fields)

        set_missing_req_fields = set_min_fields - all_set_get_fields
        if set_missing_req_fields:
            raise ValueError(
                f"Scraper: {self.__class__.__name__} Job attributes: "
                f"{set_missing_req_fields} are required and not implemented."
            )

        field_intersection = set_job_get_fields.intersection(set_job_set_fields)
        if field_intersection:
            raise ValueError(
                f"Scraper: {self.__class__.__name__} Job attributes: "
                f"{field_intersection} are implemented by both get() and set()!"
            )
        excluded_fields = []  # type: List[JobField]
        for field in JobField:
            # NOTE: we exclude status, locale, query, provider and scrape date
            # because these are set without needing any scrape data.
            # TODO: SHORT and RAW are not impl. rn. remove this check when impl.
            if (field not in [JobField.STATUS, JobField.LOCALE, JobField.QUERY,
                              JobField.SCRAPE_DATE, JobField.PROVIDER,
                              JobField.SHORT_DESCRIPTION, JobField.RAW]
                    and field not in self.job_get_fields
                    and field not in self.job_set_fields):
                excluded_fields.append(field)
        if excluded_fields:
            # NOTE: INFO level because this is OK, but ideally ppl see this
            # so they are motivated to help and understand why stuff might
            # be missing in the CSV
            self.logger.info(
                "No get() or set() will be done for Job attrs: %s",
                [field.name for field in excluded_fields]
            )
Example #27
0
def worker(args):
    q_req, q_res, lock = args
    for i in range(20):
        b = np.random.rand(6, 7) * 2 - 1
        lock.acquire()

        q_req.put(b)
        q_res.get()

        lock.release()
    q_req.put(None)


m = Manager()
q_req = m.Queue(1)
q_res = m.Queue(1)
l = m.Lock()

tasks = 6

net = Process(target=net_worker, args=(
    q_req,
    q_res,
    tasks,
))
net.start()

from time import sleep
sleep(4)

with Pool(4) as pool:
    pool.map(worker, [(q_req, q_res, l)] * tasks)
"""
Example #28
0
def fill_volume_with_model(
        model_file,
        volume,
        resume_prediction=None,
        checkpoint_filename=None,
        checkpoint_label_interval=20,
        seed_generator='sobel',
        background_label_id=0,
        bias=True,
        move_batch_size=1,
        max_moves=None,
        max_bodies=None,
        num_workers=CONFIG.training.num_gpus,
        worker_prequeue=1,
        filter_seeds_by_mask=True,
        reject_non_seed_components=True,
        reject_early_termination=False,
        remask_interval=None,
        shuffle_seeds=True):
    subvolume = volume.get_subvolume(SubvolumeBounds(start=np.zeros(3, dtype=np.int64), stop=volume.shape))
    # Create an output label volume.
    if resume_prediction is None:
        prediction = np.full_like(subvolume.image, background_label_id, dtype=np.uint64)
        label_id = 0
    else:
        if resume_prediction.shape != subvolume.image.shape:
            raise ValueError('Resume volume prediction is wrong shape.')
        prediction = resume_prediction
        prediction.flags.writeable = True
        label_id = prediction.max()
    # Create a conflict count volume that tracks locations where segmented
    # bodies overlap. For now the first body takes precedence in the
    # predicted labels.
    conflict_count = np.full_like(prediction, 0, dtype=np.uint32)

    def worker(worker_id, set_devices, model_file, image, seeds, results, lock, revoked):
        lock.acquire()
        import tensorflow as tf

        if set_devices:
            # Only make one GPU visible to Tensorflow so that it does not allocate
            # all available memory on all devices.
            # See: https://stackoverflow.com/questions/37893755
            os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
            os.environ['CUDA_VISIBLE_DEVICES'] = str(worker_id)

        with tf.device('/gpu:0'):
            # Late import to avoid Keras import until TF bindings are set.
            from .network import load_model

            logging.debug('Worker %s: loading model', worker_id)
            model = load_model(model_file, CONFIG.network)
        lock.release()

        def is_revoked(test_seed):
            ret = False
            lock.acquire()
            if tuple(test_seed) in revoked:
                ret = True
                revoked.remove(tuple(test_seed))
            lock.release()
            return ret

        while True:
            seed = seeds.get(True)

            if not isinstance(seed, np.ndarray):
                logging.debug('Worker %s: got DONE', worker_id)
                break

            if is_revoked(seed):
                results.put((seed, None))
                continue

            def stopping_callback(region):
                stop = is_revoked(seed)
                if reject_non_seed_components and \
                   region.bias_against_merge and \
                   region.mask[tuple(region.seed_vox)] < 0.5:
                    stop = True
                return stop

            logging.debug('Worker %s: got seed %s', worker_id, np.array_str(seed))

            # Flood-fill and get resulting mask.
            # Allow reading outside the image volume bounds to allow segmentation
            # to fill all the way to the boundary.
            region = Region(image, seed_vox=seed, sparse_mask=True, block_padding='reflect')
            region.bias_against_merge = bias
            early_termination = False
            try:
                six.next(region.fill(
                    model,
                    move_batch_size=move_batch_size,
                    max_moves=max_moves,
                    progress=2 + worker_id,
                    stopping_callback=stopping_callback,
                    remask_interval=remask_interval))
            except Region.EarlyFillTermination:
                early_termination = True
            except StopIteration:
                pass
            if reject_early_termination and early_termination:
                body = None
            else:
                body = region.to_body()
            logging.debug('Worker %s: seed %s filled', worker_id, np.array_str(seed))

            results.put((seed, body))

    # Generate seeds from volume.
    generator = preprocessing.SEED_GENERATORS[seed_generator]
    seeds = generator(subvolume.image, CONFIG.volume.resolution)

    if filter_seeds_by_mask and volume.mask_data is not None:
        seeds = [s for s in seeds if volume.mask_data[tuple(volume.world_coord_to_local(s))]]

    pbar = tqdm(desc='Seed queue', total=len(seeds), miniters=1, smoothing=0.0)
    label_pbar = tqdm(desc='Labeled vox', total=prediction.size, miniters=1, smoothing=0.0, position=1)
    num_seeds = len(seeds)
    if shuffle_seeds:
        random.shuffle(seeds)
    seeds = iter(seeds)

    manager = Manager()
    # Queue of seeds to be picked up by workers.
    seed_queue = manager.Queue()
    # Queue of results from workers.
    results_queue = manager.Queue()
    # Dequeue of seeds that were put in seed_queue but have not yet been
    # combined by the main process.
    dispatched_seeds = deque()
    # Seeds that were placed in seed_queue but subsequently covered by other
    # results before their results have been processed. This allows workers to
    # abort working on these seeds by checking this list.
    revoked_seeds = manager.list()
    # Results that have been received by the main process but have not yet
    # been combined because they were not received in the dispatch order.
    unordered_results = {}

    def queue_next_seed():
        total = 0
        for seed in seeds:
            if prediction[seed[0], seed[1], seed[2]] != background_label_id:
                # This seed has already been filled.
                total += 1
                continue
            dispatched_seeds.append(seed)
            seed_queue.put(seed)

            break

        return total

    for _ in range(min(num_seeds, num_workers * worker_prequeue)):
        processed_seeds = queue_next_seed()
        pbar.update(processed_seeds)

    if 'CUDA_VISIBLE_DEVICES' in os.environ:
        set_devices = False
        num_workers = 1
        logging.warning('Environment variable CUDA_VISIBLE_DEVICES is set, so only one worker can be used.\n'
                        'See https://github.com/aschampion/diluvian/issues/11')
    else:
        set_devices = True

    workers = []
    loading_lock = manager.Lock()
    for worker_id in range(num_workers):
        w = Process(target=worker, args=(worker_id, set_devices, model_file, subvolume.image,
                                         seed_queue, results_queue, loading_lock, revoked_seeds))
        w.start()
        workers.append(w)

    last_checkpoint_label = label_id

    # For each seed, create region, fill, threshold, and merge to output volume.
    while dispatched_seeds:
        processed_seeds = 1
        expected_seed = dispatched_seeds.popleft()
        logging.debug('Expecting seed %s', np.array_str(expected_seed))

        if tuple(expected_seed) in unordered_results:
            logging.debug('Expected seed %s is in old results', np.array_str(expected_seed))
            seed = expected_seed
            body = unordered_results[tuple(seed)]
            del unordered_results[tuple(seed)]

        else:
            seed, body = results_queue.get(True)
            processed_seeds += queue_next_seed()

            while not np.array_equal(seed, expected_seed):
                logging.debug('Seed %s is early, stashing', np.array_str(seed))
                unordered_results[tuple(seed)] = body
                seed, body = results_queue.get(True)
                processed_seeds += queue_next_seed()

        logging.debug('Processing seed at %s', np.array_str(seed))
        pbar.set_description('Seed ' + np.array_str(seed))
        pbar.update(processed_seeds)

        if prediction[seed[0], seed[1], seed[2]] != background_label_id:
            # This seed has already been filled.
            logging.debug('Seed (%s) was filled but has been covered in the meantime.',
                          np.array_str(seed))
            loading_lock.acquire()
            if tuple(seed) in revoked_seeds:
                revoked_seeds.remove(tuple(seed))
            loading_lock.release()
            continue

        if body is None:
            logging.debug('Body was None.')
            continue

        if reject_non_seed_components and not body.is_seed_in_mask():
            logging.debug('Seed (%s) is not in its body.', np.array_str(seed))
            continue

        if reject_non_seed_components:
            mask, bounds = body.get_seeded_component(CONFIG.postprocessing.closing_shape)
        else:
            mask, bounds = body._get_bounded_mask()

        body_size = np.count_nonzero(mask)

        if body_size == 0:
            logging.debug('Body was empty.')
            continue

        # Generate a label ID for this region.
        label_id += 1
        if label_id == background_label_id:
            label_id += 1

        logging.debug('Adding body to prediction label volume.')
        bounds_shape = tuple(map(slice, bounds[0], bounds[1]))
        prediction_mask = prediction[bounds_shape] == background_label_id
        for seed in dispatched_seeds:
            if np.all(bounds[0] <= seed) and np.all(bounds[1] > seed) and mask[tuple(seed - bounds[0])]:
                loading_lock.acquire()
                if tuple(seed) not in revoked_seeds:
                    revoked_seeds.append(tuple(seed))
                loading_lock.release()
        conflict_count[bounds_shape][np.logical_and(np.logical_not(prediction_mask), mask)] += 1
        label_shape = np.logical_and(prediction_mask, mask)
        prediction[bounds_shape][np.logical_and(prediction_mask, mask)] = label_id

        label_pbar.set_description('Label {}'.format(label_id))
        label_pbar.update(np.count_nonzero(label_shape))
        logging.info('Filled seed (%s) with %s voxels labeled %s.',
                     np.array_str(seed), body_size, label_id)

        if max_bodies and label_id >= max_bodies:
            # Drain the queues.
            while not seed_queue.empty():
                seed_queue.get_nowait()
            break

        if checkpoint_filename is not None and label_id - last_checkpoint_label > checkpoint_label_interval:
            config = HDF5Volume.write_file(
                    checkpoint_filename + '.hdf5',
                    CONFIG.volume.resolution,
                    label_data=prediction)
            config['name'] = 'segmentation checkpoint'
            with open(checkpoint_filename + '.toml', 'w') as tomlfile:
                tomlfile.write('# Filling model: {}\n'.format(model_file))
                tomlfile.write(str(toml.dumps({'dataset': [config]})))

    for _ in range(num_workers):
        seed_queue.put('DONE')
    for wid, worker in enumerate(workers):
        worker.join()
    manager.shutdown()

    label_pbar.close()
    pbar.close()

    return prediction, conflict_count
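
Inside the worker above, the TensorFlow import and model load are wrapped in lock.acquire()/lock.release() so that only one GPU worker initializes at a time. A minimal sketch of serializing an expensive per-worker initialization step with a Manager lock (the sleep stands in for the real model loading):

from multiprocessing import Manager, Process
from time import sleep

def worker(worker_id, init_lock, results):
    with init_lock:  # only one worker performs the costly setup at a time
        sleep(0.1)   # stand-in for importing a framework / loading weights
        results.append('worker %d initialized' % worker_id)
    # ...the actual, unserialized work would follow here...

if __name__ == '__main__':
    manager = Manager()
    init_lock = manager.Lock()
    results = manager.list()
    procs = [Process(target=worker, args=(i, init_lock, results))
             for i in range(3)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    print(list(results))
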
Example #29
0
    def __init__(self, grid: BaseGrid, manager: Manager):
        self.lock = manager.Lock()
        self.grid = grid
        self.manager_list = manager.list()
        self.manager_list.append(grid)
Example #30
0
		print(handB)
	###########################################################################################
	time.sleep(1)
	print("DISPLAY WILL NOW EXIT")
	time.sleep(1)


key = 1

if __name__ == "__main__":
	
	os.system('clear')
	
	pile_manager = Manager()
	
	pile_mutex = pile_manager.Lock()
	
	action_manager = Manager()

	action_mutex = action_manager.Lock()

	mq = sysv_ipc.MessageQueue(key, sysv_ipc.IPC_CREAT)

	keyboard = kb.KBHit()

	manager = Manager()

	pile = manager.list(create_pile())

	display = multiprocessing.Process(target=update_display, args=())
	board_process = multiprocessing.Process(target=board, args=(pile_mutex,))