Example #1
    def send_notification_batch(self, notifications):
        total = []
        for entries_chunk in chunk_list(notifications, 10):
            total.append(encode_notification_for_sqs(entries_chunk))

        for chunk_of_total in chunk_list(total, 10):
            self.client.send_message_batch(
                QueueUrl=self.queue_url,
                Entries=[
                    self.create_notification_message(e) for e in chunk_of_total
                ])
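Every example on this page relies on a chunk_list helper that is not itself shown. In most call sites the second argument is a chunk size (10, 50, 100, 1000 items per chunk), though some projects pass a worker count instead and split into that many chunks, so the sketch below is only an assumption of the most common variant: a generator that yields consecutive slices of at most size items.

def chunk_list(items, size):
    """Minimal sketch (assumed, not taken from any of the projects on this
    page): yield successive slices of at most `size` elements from `items`."""
    for start in range(0, len(items), size):
        yield items[start:start + size]

# e.g. list(chunk_list(list(range(7)), 3)) -> [[0, 1, 2], [3, 4, 5], [6]]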
Example #2
def start_scraper(url, q_name, parsers, downloaders):
    task_q = qhandler.get_task_q()

    for sections in utils.chunk_list(utils.SECTIONS, parsers):
        webm_q = qhandler.create_channel(queue_name=q_name)
        parser.start_thread(task_q, webm_q, url, sections)

    for _ in range(downloaders):
        channel = qhandler.create_channel(queue_name=q_name)
        downloader.start_thread(channel, q_name)
Example #3
def ncbigene_make():
    IDS_FILE = 'gene-subset-ids.txt'
    with open(IDS_FILE, 'rt') as f:  # this came from neuroNER
        ids = [l.split(':')[1].strip() for l in f.readlines()]
    
    #url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?retmode=json&retmax=5000&db=gene&id='
    #for id_ in ids:
        #data = requests.get(url + id_).json()['result'][id_]
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'
    data = {
        'db':'gene',
        'retmode':'json',
        'retmax':5000,
        'id':None,
    }
    chunks = []
    for i, idset in enumerate(chunk_list(ids, 100)):
        print(i, len(idset))
        data['id'] = ','.join(idset)
        resp = requests.post(url, data=data).json()
        chunks.append(resp)
    
    base = chunks[0]['result']
    uids = base['uids']
    for more in chunks[1:]:
        data = more['result']
        uids.extend(data['uids'])
        base.update(data)
    #base['uids'] = uids  # i mean... its just the keys
    base.pop('uids')
 
    prefixes = {
        'ilx':'http://uri.interlex.org/base/',
        'OBOANN':'http://ontology.neuinfo.org/NIF/Backend/OBO_annotation_properties.owl#',  # FIXME needs to die a swift death
        'NCBIGene':'http://www.ncbi.nlm.nih.gov/gene/',
        'NCBITaxon':'http://purl.obolibrary.org/obo/NCBITaxon_',
    }
    ng = makeGraph('ncbigeneslim', prefixes)

    for k, v in base.items():
        #if k != 'uids':
        ncbi(v, ng)

    ontid = 'http://ontology.neuinfo.org/NIF/ttl/generated/ncbigeneslim.ttl'
    ng.add_node(ontid, rdflib.RDF.type, rdflib.OWL.Ontology)
    ng.add_node(ontid, rdflib.RDFS.label, 'NIF NCBI Gene subset')
    ng.add_node(ontid, rdflib.RDFS.comment, 'This subset is automatically generated from the NCBI Gene database on a subset of terms listed in %s.' % IDS_FILE)
    ng.add_node(ontid, rdflib.OWL.versionInfo, date.isoformat(date.today()))
    ng.write()
Example #4
def ncbigene_make():
    IDS_FILE = 'resources/gene-subset-ids.txt'
    with open(IDS_FILE, 'rt') as f:  # this came from neuroNER
        ids = [l.split(':')[1].strip() for l in f.readlines()]

    #url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?retmode=json&retmax=5000&db=gene&id='
    #for id_ in ids:
    #data = requests.get(url + id_).json()['result'][id_]
    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'
    data = {
        'db': 'gene',
        'retmode': 'json',
        'retmax': 5000,
        'id': None,
    }
    chunks = []
    for i, idset in enumerate(chunk_list(ids, 100)):
        print(i, len(idset))
        data['id'] = ','.join(idset)
        resp = requests.post(url, data=data).json()
        chunks.append(resp)

    base = chunks[0]['result']
    uids = base['uids']
    for more in chunks[1:]:
        data = more['result']
        uids.extend(data['uids'])
        base.update(data)
    #base['uids'] = uids  # i mean... its just the keys
    base.pop('uids')

    ng = createOntology(
        'ncbigeneslim',
        'NIF NCBI Gene subset',
        makePrefixes('ILXREPLACE', 'ilx', 'OBOANN', 'NCBIGene', 'NCBITaxon',
                     'skos', 'owl'),
        'ncbigeneslim',
        'This subset is automatically generated from the NCBI Gene database on a subset of terms listed in %s.'
        % IDS_FILE,
        remote_base='http://ontology.neuinfo.org/NIF/')

    for k, v in base.items():
        #if k != 'uids':
        ncbi(v, ng)
    ng.write()
Example #5
def send_runcommand(ssm_client, **kwargs):  # noqa
    """Takes in a boto3 session and some kwargs, splits list of instances
    into groups for 50, sends RunCommand, and returns a list of the
    responses.
    """
    doc = 'AWS-RunShellScript'
    response = []
    chunks = chunk_list(kwargs['instances'], 50)  # max 50 instances
    for chunk in chunks:  # iterate over chunks of 50 instances
        response.append(ssm_client.send_command(
            DocumentName=doc,
            InstanceIds=chunk,
            Parameters={  # value must be a list
                'commands': [
                    "#!/bin/bash",
                    'bucket={bucket}'.format(**kwargs),
                    'instance_id=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)',
                    'echo $instance_id $bucket',
                    'if type -t rpm >/dev/null 2>&1;then',
                    ('''    pkg_list=$(rpm -qa --queryformat '"%-30{NAME}": '''
                     '''"%10{VERSION}-%20{RELEASE}",' | sed -e 's~,$~~'  | tr -d ' ')'''),
                    '    echo "{${pkg_list}}" | \\',
                    ('''    python -c 'import json, sys; print(json.dumps('''
                     '''json.loads(sys.stdin.read()), indent=4))' > pkg_list.json'''),
                    '    echo Retrieved package list from rpm',
                    'fi',
                    'if type -t dpkg >/dev/null 2>&1;then',
                    '    echo "Found debian"',
                    'fi',
                    'test -e pkg_list.json || echo unable to find pkg_list.json',
                    'aws s3 cp pkg_list.json s3://$bucket/patching-state/%s/${instance_id}.json' % (
                        kwargs['delta_date']),
                    'echo Completed Export',
                ],
            },
            OutputS3BucketName=kwargs['bucket'],
            OutputS3KeyPrefix='command-output',
            TimeoutSeconds=kwargs['timeout'],
            MaxErrors='10',
        )['Command'])  # appends command return to a list for return

    return response
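The docstring above lists the kwargs that send_runcommand reads (instances, bucket, delta_date, timeout). A hypothetical call might look like the sketch below; the client setup, instance IDs, bucket name and date are placeholders, not values from the original project.

import boto3

ssm = boto3.client('ssm', region_name='us-east-1')
command_infos = send_runcommand(
    ssm,
    instances=['i-0123456789abcdef0', 'i-0fedcba9876543210'],  # chunked into groups of 50 internally
    bucket='example-patching-bucket',
    delta_date='2021-01-01',
    timeout=600,
)
print(len(command_infos), 'SSM command batches sent')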
Example #6
    def forward(self,
                inp_spectrum,
                inp_wvlens,
                part_covered=True,
                tol=0.5,
                pad=False,
                ng4=False,
                invert=True,
                snr=True,
                dc=True,
                smear=True,
                run_with_binned=True,
                return_binned=False,
                run_specs=None,
                run_specs_inner=None,
                *args,
                **kwargs):
        """

        :param inp_spectrum: (batches, channels, spectrum); computations are threaded along batches, and all spectra
                             in a channel are computed in vectorized form, so they must share the same wavelength
                             support but may differ in intensity. If only one batch is supplied, it is broadcast to
                             all supplied inp_wvlens.
        :param inp_wvlens: Wavelength support for each batch. If only one inp_wvlens is supplied, it is broadcast to
                           all batches.

        :param part_covered:
        :param tol:
        :param args:
        :param kwargs:
        :return:
        """
        binned = run_with_binned

        inp_wvlens = np.atleast_2d(inp_wvlens)
        # reshape input spectrum
        if len(inp_spectrum.shape) == 2:
            # we assume (batch, wvl)
            inp_spectrum = inp_spectrum[:, None, None, ...]
        elif len(inp_spectrum.shape) == 3:
            # we assume (batch, channel, wvl)
            inp_spectrum = inp_spectrum[:, :, None, ...]
        elif len(inp_spectrum.shape) == 4:
            # we assume (batch, channel, pix, wvl)
            pass
        elif len(inp_spectrum.shape) == 5:
            # we assume (batch, channel, band, xtrack, wvl)
            inp_spectrum = inp_spectrum.reshape(inp_spectrum.shape[0],
                                                inp_spectrum.shape[1], -1,
                                                inp_spectrum.shape[-1])
        else:
            raise Exception('Input spectrum has wrong shape.')

        if self.get('res', binned) is not None:
            warnings.warn(
                'WARNING: calculates convolution at different resolutions.')

        if not return_binned and 'unbinned' not in self.params:
            raise Exception('Unbinned params are not available.')

        assert self.check_srfs_initialized(binned=binned)
        assert self.check_inp_spectrum_consistency(inp_spectrum,
                                                   inp_wvlens,
                                                   binned=binned)

        # Determine how many batches per job and prepare run_specs
        if run_specs is None:
            run_specs = {}

        if 'batches_per_job' not in run_specs:
            batches_per_job = 1000
        else:
            batches_per_job = run_specs['batches_per_job']
            run_specs = {
                k: v
                for k, v in run_specs.items() if k != 'batches_per_job'
            }

        if run_specs_inner is None:
            run_specs_inner = dict(joblib=False)

        # broadcast: if only one inp_wvls is supplied, assume it is the same for all inp_spectra in the batch
        if len(inp_wvlens) == 1 and len(inp_spectrum) > 1:
            inp_wvlens = [inp_wvlens[0]] * len(inp_spectrum)

        # broadcast: if only one inp_spectrum is supplied, assume it is the same for all inp_wvls
        if len(inp_wvlens) > 1 and len(inp_spectrum) == 1:
            inp_spectrum = [inp_spectrum[0]] * len(inp_wvlens)

        # define jobs
        job_inp_spectra = chunk_list(inp_spectrum, batches_per_job)
        job_inp_wvls = chunk_list(inp_wvlens, batches_per_job)
        jobs = [
            partial(self._forward,
                    inp_spectrum=inp_s,
                    inp_wvlens=inp_w,
                    binned=binned,
                    part_covered=part_covered,
                    tol=tol,
                    pad=pad,
                    ng4=ng4,
                    invert=invert,
                    snr=snr,
                    dc=dc,
                    smear=smear,
                    return_binned=return_binned,
                    run_specs=run_specs_inner,
                    *args,
                    **kwargs)
            for inp_s, inp_w in zip(job_inp_spectra, job_inp_wvls)
        ]

        # flatten out job dimension such that we have (batch, channel, band, xdir)
        res, illu_bands = zip(*run_jobs(jobs, **run_specs))
        res = list(itertools.chain(*res))
        illu_bands = list(itertools.chain(*illu_bands))

        return res, illu_bands
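The shape handling at the top of forward normalizes every input to a 4-D (batch, channel, pix, wvl) array before the batches are chunked into jobs. A small standalone numpy illustration of that convention (the array sizes are arbitrary placeholders):

import numpy as np

spec_2d = np.zeros((4, 128))             # (batch, wvl)
spec_4d = spec_2d[:, None, None, ...]    # -> (4, 1, 1, 128)

spec_3d = np.zeros((4, 3, 128))          # (batch, channel, wvl)
spec_4d_b = spec_3d[:, :, None, ...]     # -> (4, 3, 1, 128)

spec_5d = np.zeros((4, 3, 2, 5, 128))    # (batch, channel, band, xtrack, wvl)
spec_4d_c = spec_5d.reshape(spec_5d.shape[0], spec_5d.shape[1], -1,
                            spec_5d.shape[-1])  # -> (4, 3, 10, 128)

print(spec_4d.shape, spec_4d_b.shape, spec_4d_c.shape)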
Example #7
print(properties)

def furl(url):
    url = url.replace('[','-5B')
    url = url.replace(']','-5D')
    url = url.replace('?','-3F')
    url = url.replace('=','%3D')
    return url

url_prefix = 'http://neurolex.org/wiki/Special:Ask/[[Category:Entity]]/'
url_suffix = '/mainlabel=Categories/format=csv/sep=,/offset={}/limit={}'

results = []
result_step = 2500
# see https://www.semantic-mediawiki.org/wiki/Help:Configuration#Query_settings
for props in chunk_list(properties, 10):  # 20 too long :/ may be able to fix via $smwgQMaxSize which defaults to 12
    all_rows = []
    for start in range(0, 30001, result_step):  # offset limit is fixed via $smwgQMaxLimit in SMW_Settings.php
        url = url_prefix + '/?'.join(props) + url_suffix.format(start, result_step)  # crazy stuff when you leave out the ?
        try:
            data = requests.get(furl(url))
        except:
            print('FAILED on URL =', furl(url))
            #embed()
            # data is still set from the previous request, so this will just duplicate the previous block
        reader = csv.reader(data.text.splitlines())
        rows = [r for r in reader]
        all_rows.extend(rows)

    results.append(all_rows)
Example #8
def furl(url):
    url = url.replace('[', '-5B')
    url = url.replace(']', '-5D')
    url = url.replace('?', '-3F')
    url = url.replace('=', '%3D')
    return url


url_prefix = 'http://neurolex.org/wiki/Special:Ask/[[Category:Entity]]/'
url_suffix = '/mainlabel=Categories/format=csv/sep=,/offset={}/limit={}'

results = []
result_step = 2500
# see https://www.semantic-mediawiki.org/wiki/Help:Configuration#Query_settings
for props in chunk_list(
        properties, 10
):  # 20 too long :/ may be able to fix via $smwgQMaxSize which defaults to 12
    all_rows = []
    for start in range(
            0, 30001, result_step
    ):  # offset limit is fixed via $smwgQMaxLimit in SMW_Settings.php
        url = url_prefix + '/?'.join(props) + url_suffix.format(
            start, result_step)  # crazy stuff when you leave out the ?
        try:
            data = requests.get(furl(url))
        except:
            print('FAILED on URL =', furl(url))
            #embed()
            # data is still set from the previous request, so this will just duplicate the previous block
        reader = csv.reader(data.text.splitlines())
        rows = [r for r in reader]
        all_rows.extend(rows)

    results.append(all_rows)
Example #9
    def run_detection(self,
                      input_path,
                      generate_bbox_images=True,
                      recursive=True,
                      n_cores=0,
                      results=None,
                      checkpoint_path=None,
                      checkpoint_frequency=-1,
                      electron=False):

        image_file_names = find_images(input_path, recursive=recursive)
        print(len(image_file_names))
        #flash(len(image_file_names))

        if results is None:
            results = []

        already_processed = set([i['file'] for i in results])

        gpu_available = True if tf.config.list_physical_devices(
            'GPU') else False

        if n_cores > 1 and gpu_available:
            logging.warning(
                'Multiple cores requested, but a GPU is available; '
                'parallelization across GPUs is not currently '
                'supported, defaulting to one GPU')

        # If we're not using multiprocessing...
        if n_cores <= 1 or gpu_available:
            count = 0  # Does not count those already processed
            # Note: stylising the bar with custom characters breaks in Electron; need to investigate
            print("we're in")
            #flash('innnnn')
            with click.progressbar(length=len(image_file_names),
                                   label='Processing Images',
                                   show_pos=True,
                                   show_eta=True,
                                   show_percent=True,
                                   info_sep='|') as bar:
                for im_file in image_file_names:
                    # Will not add additional entries not in the starter checkpoint
                    if im_file in already_processed:
                        logging.info(
                            f'Bypassing already processed image: {im_file}')
                        continue

                    count += 1

                    result = self.__process_image(im_file,
                                                  generate_bbox_images)
                    results.append(result)
                    bar.update(1)

                    # this is for megadetector-gui usage
                    if electron:
                        print(bar.format_progress_line(), flush=True)

                    # checkpoint
                    if checkpoint_frequency != -1 and count % checkpoint_frequency == 0:
                        logging.info(
                            f'Writing a new checkpoint after having '
                            f'processed {count} images since last restart')
                        with open(checkpoint_path, 'w') as f:
                            json.dump({'images': results}, f)

        else:
            # when using multiprocessing, let the workers load the model
            logging.info(f'Creating pool with {n_cores} cores')

            if len(already_processed) > 0:
                logging.warning(
                    'When using multiprocessing, all images are reprocessed')

            pool = workerpool(n_cores)

            image_batches = list(chunk_list(image_file_names, n_cores))
            results = pool.map(partial(self.__process_images, image_batches),
                               image_batches, generate_bbox_images)
            results = list(itertools.chain.from_iterable(results))

        self.save(results)

        return results