def rate_key_size(key_size, ciphertext):
	dist = 0
	for block_1, block_2 in zip(chunks(ciphertext, key_size), chunks(ciphertext, key_size)[1:]):
		dist += hamming_distance(block_1, block_2)
	dist /=  len(ciphertext) / key_size
	normalized = dist / key_size
	return normalized
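These examples all lean on a small chunks() helper that is never shown here. A minimal sketch of what the first example assumes (a list-returning chunks, since the result is sliced, plus a bitwise hamming_distance over equal-length byte strings) might look like the following; both functions are reconstructions, not the original code:

def chunks(seq, size):
    # Split seq into consecutive pieces of length `size`; the last piece may be shorter.
    return [seq[i:i + size] for i in range(0, len(seq), size)]

def hamming_distance(a, b):
    # Count differing bits between two equal-length byte strings.
    return sum(bin(x ^ y).count('1') for x, y in zip(bytearray(a), bytearray(b)))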
Example #2
def patient_report(request, patient_key):
    try:
        patient = Patient.objects.get(key=patient_key)
    except Patient.DoesNotExist:
        raise Http404

    chads2_relative = {
        'sad': round(patient.chads2_risk()['percentage']),
        'happy': 100 - round(patient.chads2_risk()['percentage'])
    }
    faces = ('S' * chads2_relative['sad']) + ('H' * chads2_relative['happy'])
    chads2_relative['rows'] = utils.chunks(faces, 10)


    hasbled_relative = {
        'sad': round(patient.hasbled_risk()['percentage']),
        'happy': 100 - round(patient.hasbled_risk()['percentage'])
    }
    faces = ('S' * hasbled_relative['sad']) + ('H' * hasbled_relative['happy'])
    hasbled_relative['rows'] = utils.chunks(faces, 10)
    context = {
        'patient': patient,
        'chads2_relative': chads2_relative,
        'hasbled_relative': hasbled_relative,
    }

    return render(request, 'data_entry/patients/reports/patient.html', context)
Example #3
    def test_forest_classifiers(self):
        """
        Confirm the basic accuracy of our classifiers.
        """
        
        #http://scikit-learn.org/stable/datasets/
        n_estimators=100
        
        # The number of parts the dataset will be split into.
        parts = 10
        
        datasets = [
            ('Iris', load_iris()),
            ('Digits', load_digits()),
        ]
        
        classifiers = [
            (AdaBoostClassifier, partial(AdaBoostClassifier, n_estimators=n_estimators)),

            (ExtraTreesClassifier, partial(ExtraTreesClassifier, n_estimators=n_estimators)),

            (DecisionTreeClassifier, DecisionTreeClassifier),
            (StreamingDecisionTreeClassifier, partial(StreamingDecisionTreeClassifier, n_estimators=n_estimators)),

            (RandomForestClassifier, partial(RandomForestClassifier, n_estimators=n_estimators)),
            (StreamingRandomForestClassifier, partial(StreamingRandomForestClassifier, n_estimators=n_estimators)),

            (ExtraTreesClassifier, partial(ExtraTreesClassifier, n_estimators=n_estimators)),
            (StreamingExtraTreesClassifier, partial(StreamingExtraTreesClassifier, n_estimators=n_estimators)),
        ]
        
        for name, dataset in datasets:
            print('\nDataset\t%s' % name, len(dataset.data))
            
            # Split our dataset into evenly-sized parts, simulating having
            # to train our classifiers out-of-core on massive datasets.
            # Note, the reference classifiers that don't support partial_fit()
            # will only be trained on each individual chunk.
            parts_n = len(dataset.data) // parts
            data_chunks = list(utils.chunks(dataset.data, parts_n))
            target_chunks = list(utils.chunks(dataset.target, parts_n))
            
            print('Score\tClassifier')
            for cls, cls_callable in classifiers:
                random.seed(0)
                clf = cls_callable()
                for data, target in zip(data_chunks, target_chunks):
#                    print(data)
#                    print(target)
                    assert len(data) == len(target)
                    if hasattr(clf, 'partial_fit'):
                        clf.partial_fit(data, target)
                    else:
                        clf.fit(data, target)
                #scores = cross_val_score(clf, data, target)
                #score = scores.mean()
                score = clf.score(dataset.data, dataset.target)
                print('%.04f\t%s' % (score, cls.__name__))
def copy_csvfile_to_table(f, table_name, delimiter, output_stream, db_params):
    with DBConnection(db_params) as conn:
        cur = conn.cursor()

        # cur.copy_from has awkward quoting rules and poor error messages,
        # so we build the INSERT statements ourselves.
        # Crude, but it works.
        input_file = csv.DictReader(f, delimiter=delimiter)

        processed_rows_counter = 0
        fields_list = []
        all_values = []

        for row in input_file:
            values = ()
            for k, v in row.iteritems():
                if processed_rows_counter == 0:  # Only needed once!
                    fields_list.append(k)

                values = values + (v,)

            all_values.append(values)
            processed_rows_counter += 1

        s = _get_insert_string(table_name, fields_list)

        for vals in chunks(all_values, 500):
            cur.executemany(s, vals)

        #if cur.rowcount != 1:
        #    output_stream.write("ERROR: rowcount is {rowcount} for {query}\n".format(rowcount=cur.rowcount, query=s))

        conn.commit()
        output_stream.write("{i} processed rows ".format(i=processed_rows_counter))
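The _get_insert_string() helper used above (and again in insert_many below) is not part of this snippet. A plausible sketch, assuming a psycopg2-style %s paramstyle suitable for cur.executemany(), could be:

def _get_insert_string(table_name, fields_list):
    # e.g. INSERT INTO my_table (col_a, col_b) VALUES (%s, %s)
    placeholders = ', '.join(['%s'] * len(fields_list))
    return 'INSERT INTO {} ({}) VALUES ({})'.format(
        table_name, ', '.join(fields_list), placeholders)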
Example #5
 def new_scrobble(self, **kwargs):
     """
     keyword arguments are the exact same as Scrobble.__init__
     It makes the new scrobble object, then tries to send all scrobbles
     that need sending.
     """
     kwargs['session'] = self
     kwargs['sent'] = False
     new_scrobble = Scrobble(**kwargs)
     
     scrobbles = self.get_failed_scrobbles()
     
     try:
         for chunk in chunks(scrobbles, chunksize=50):
             # Send previously failed scrobbles to lastfm in chunks of 50
             ss = ScrobbleSet(chunk)
             ss.try_to_send()
             
     except LastFMError:
         # there were old scrobbles that needed to be sent first, and some of them
         # failed. Queue this one up, and don't bother sending any more.
         # (lastfm is most likely down)
         new_scrobble.sent = False
         new_scrobble.save()
         return False
     else:    
         # Either there were no old scrobbles to send first, or they
         # all were sent successfully! send the new scrobble now.
         new_scrobble.timestamp = int(time.time())
         new_scrobble.send()
         return True
Example #6
def get_attributes(args):
    """
    Gather all data necessary for metrics calculations
    """
    # Get publication information
    if 'query' in args:
        # If we were fed a query, gather the associated bibcodes
        bibcodes = get_publications_from_query(args['query'])
    elif 'bibcodes' in args:
        bibcodes = map(lambda a: a.strip(), args['bibcodes'])
    elif 'libid' in args:
        # In theory we allow for retrieving bibcodes from private libraries
        # Clearly this will currently not be used
        bibcodes = get_bibcodes_from_private_library(args['libid'])
    # Split the list of bibcodes up in chunks, for parallel processing
    biblists = list(chunks(bibcodes,config.METRICS_CHUNK_SIZE))
    # Now gather all usage data numbers from the MongoDB 'adsdata' collection,
    # keyed on bibcode
    ads_data = get_mongo_data(bibcodes=bibcodes)
    missing_bibcodes = filter(lambda a: a not in ads_data.keys(), bibcodes)
    if len(missing_bibcodes) > 0:
        app.logger.error("Bibcodes found with missing metadata: %s" % ",".join(missing_bibcodes))
    bibcodes = filter(lambda a: a not in missing_bibcodes, bibcodes)
    # Get precomputed and citation data
    metrics_data = get_metrics_data(bibcodes=bibcodes)
    # Get the number of citing papers
    Nciting = len(list(set(itertools.chain(*map(lambda a: a['citations'], metrics_data.values())))))
    Nciting_ref = len(list(set(itertools.chain(*map(lambda a: a['refereed_citations'], metrics_data.values())))))
    # The attribute vectors will be used to calculate the metrics
    attr_list = make_vectors(bibcodes,ads_data,metrics_data)
    # We sort the entries in the attribute list on citation count, which
    # will make e.g. the calculation of 'h' trivial
    attr_list = sort_list_of_lists(attr_list,2)

    return attr_list,Nciting,Nciting_ref
Example #7
def play_sound(fname):
    global cfg_dict
    cfg_dict = utils.read_config()

    audio = pyaudio.PyAudio()
    stream_audio = audio.open(format=pyaudio.paInt16,
                              channels=2,
                              rate=cfg_dict['rate'],
                              output=True,
                              frames_per_buffer=cfg_dict['rate'])

    f = open(fname, 'rb')
    data = list(utils.chunks(f.read(), cfg_dict['rate']))
    f.close()

    print ' * play'
    start = time.time()

    for i in range(len(data)):
        stream_audio.write(data[i])

    print ' * play end |',

    stream_audio.stop_stream()
    stream_audio.close()
    audio.terminate()

    print 'elapsed time =', time.time() - start
Example #8
    def _decodePeersInfo(self, data):
        """
        Decode all peers.
        """
        peers = utils.chunks(data, 7)
        peers = map(self._decodePeerInfo, peers)

        return list(peers)
def insert_many(table_name, fields_list, all_values, db_params):
    with DBConnection(db_params) as conn:
        cur = conn.cursor()
        
        s = _get_insert_string(table_name, fields_list)

        for vals in chunks(all_values, 500):
            cur.executemany(s, vals)

        conn.commit()
Example #10
    def stop(self):
        # Handle the saved bits
        for chunk in chunks(self.extra_bits, 8):
            if len(chunk) == 8:
                byte = sum([ x << i for i, x in enumerate(chunk) ])

                for x in self.chain:
                    x.put(byte)

        # Continue
        return super(VonNeumannExtractor, self).stop()
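The byte-packing line above treats each 8-bit chunk as least-significant-bit first. A quick worked check of that expression:

bits = [1, 0, 1, 1, 0, 0, 0, 0]                                # LSB first
assert sum(x << i for i, x in enumerate(bits)) == 0b00001101   # == 13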
Example #11
 def _populate_plankton(self):
     print 'Populating Planktons...'
     portions = utils.chunks(range(self.wally.shape[0]),
                             self.wally.shape[0]/len(self.qm.keys()))
     for p in self.qm.keys():
         r = portions.next()
         self.qm[p]['range'] = (r[0], r[-1])
         self.qm[p]['q_in'].put([self.capacity, self.mutation,r[0],
                                 self.wally[r[0]:r[-1]+1],
                                 self.meteo[:,r[0]:r[-1]+1],
                                 self.nada])
         print '\tPLANKTON QM - %i \trange: %i - %i' % (p, r[0], r[-1])
Example #12
def admin_clearAllBuildData(req):
    all_keys = getAllFromFromQuery(datamodel.DB_FileBuild.all(keys_only=True))

    for x in chunks(all_keys, 1000):
        db.delete(x)

    nextVersionNum = datamodel.DB_JV_AtomicCounter.GetNextCounter(_fileVerKey)

    from inmemconfig import InAppMemConfig
    InAppMemConfig.ForceVersionIncrement()

    return RetType.JSONSUCCESS
Example #13
 def correct(self, word):
     global WORD
     WORD = word
     p = Pool(self.n_jobs)
     chunk_size = int(len(self.forms) / self.n_jobs)
     # form, max_prob = get_most_probable_from_chunk(self.forms)
     arguments = chunks(self.forms, chunk_size)
     results = p.map(get_most_probable_from_chunk, arguments)
     p.close()
     p.join()
     form, max_prob = max(results, key=operator.itemgetter(1))
     return form
    def put(self, filename):
        file_uuid = uuid4().hex

        file = File(name=str(urllib.unquote(filename)),
             content_type=self.request.headers.get('Content-Type', None),
             key_name=file_uuid)
        file.put()

        for chunk in chunks(self.request.body, config.max_fragment_size):
            Fragment(file=file, data=chunk).put()

        self.response.set_status(201)
        self.response.out.write(file_uuid)
Example #15
 def dump(self):
     result = ""
     result += "Start: " + self.exploit.pointer_format % self.start + " (" + self.exploit.closest_section_from_address(self.start) + ")\n"
     result += "Size: " + self.exploit.pointer_format % self.size + " (" + str(self.size) + ")\n"
     result += "End: " + self.exploit.pointer_format % self.end + "\n"
     result += "Base: " + self.exploit.pointer_format % self.align_to + "\n"
     result += "Alignment: " + str(self.alignment) + "\n"
     result += "Index: " + hex(self.index) + " (" + str(self.index) + ")\n"
     result += "Wasted: " + str(self.wasted) + "\n"
     result += "Content:\n"
     for chunk in chunks(self.content, self.exploit.pointer_size):
         result += " " * 4 + " ".join(["%.2x" % ord(c) for c in chunk]) + " " + (self.exploit.pointer_format % self.exploit.str2ptr(chunk) if len(chunk) == self.exploit.pointer_size else "") + "\n"
     return result
Example #16
def sites_update():
    master_list = "http://www.metoffice.gov.uk/public/data/PWSCache/Locations/MasterList?format=application/json"
#    master_list = "http://localhost/~gareth/MasterList.json"

    result = urlfetch.fetch(master_list)

    if result.status_code == 200:
        obs_sites = filter(lambda loc: loc["type"] == "Observing Site", parse_locations(result.content))
        for chunk in chunks(obs_sites,10):
            taskqueue.add(url="/admin/sites/store", params = {"sites":json.dumps(chunk)})
        flash("Started load of %d sites" % len(obs_sites))
    else:
        flash("Error fetching MasterList: [%d] - %s" % (result.status_code, result.status_message))

    return redirect(url_for('index'))
def save_states(q, gpu, target, limit, mem_ratio, model_dir, seed=0, chunksize=1000):
    os.environ['CUDA_VISIBLE_DEVICES'] = gpu
    print 'GPU {}'.format(gpu)

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=mem_ratio)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        
        load_graph(os.path.join(model_dir, 'classify_image_graph_def.pb'))
        next_last_layer = sess.graph.get_tensor_by_name('pool_3:0')
        
        while True:
            source = q.get()
            if source == KILL:
                break

            images = glob.glob('{}/*'.format(source))
            random.seed(seed)
            random.shuffle(images)
            if limit > 0:
                images = images[:limit]

            t0 = time.time()
            h5name = os.path.join(target, '{}.h5'.format(os.path.basename(os.path.normpath(source))))

            with pd.HDFStore(h5name, mode='w', complevel=9, complib='blosc') as store:
                for chunk in chunks(images, chunksize):

                    states = []
                    for jpg in list(chunk): # Creates a copy over which it is safe to iterate
                        try:
                            raw_data = gfile.FastGFile(jpg).read()
                            hidden_layer = sess.run(next_last_layer,
                                                    {'DecodeJpeg/contents:0': raw_data})
                            hidden_layer = np.squeeze(hidden_layer)
                            states.append(hidden_layer)

                        except Exception as e:
                            chunk.remove(jpg)
                            print 'Something went wrong when processing {}'.format(jpg)

                    X = np.vstack(states)
                    columns = [ 'f{}'.format(i) for i in range(X.shape[1]) ]
                    
                    df = pd.DataFrame(data=X, index=chunk, columns=columns)
                    df.index.name='filename'
                    store.append('data', df)

            print('Time spent collecting {} states: {}'.format(len(images), time.time() - t0))
    def imported(self, date_str, **kwargs):
        start_id = kwargs.get('start', 1)
        end_id = kwargs.get('end', -1)
        ids = self.database_service.load_ids(self.market, start_id, end_id)
        for batch_app_ids in chunks(ids, DEFAULT_BATCH_SIZE):
            print 'Started to import batch:', len(batch_app_ids)
            logger.info('Started to import batch: {}'.format(len(batch_app_ids)))
            for app_id in batch_app_ids:
                content = self._load(date_str, app_id)
                detail_dict = self._parser(content)
                self._save(app_id, detail_dict)

            garbage_number = gc.collect()
            print 'Garbage number:', garbage_number

        self.database_service.close()
Example #19
 def __init__(self):
     self.cells = []
     for _ in xrange(4):
         self.cells.append(Cell())
     self.foundations = []
     for _ in xrange(4):
         self.foundations.append(Foundation())
     self.cascades = []
     for _ in xrange(8):
         self.cascades.append(Cascade())
     # TODO fill cascades with cards
     deck = Deck()
     deck.shuffle()
     for chunk in chunks(deck, 8):
         for cascade, card in izip(self.cascades, chunk):
             cascade.append(card)
 def write_flag_export(key_list):
     connection = connect_db(ORACONN)
     cursor = connection.cursor()
     date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
     try:
         for k in list(chunks(key_list, 1000)):
             cursor.execute(
                 "UPDATE EXP_LOG_KEYS_UNIFARM SET FLAG_EXPORT=0, UPDATE_DATE_TIME='%s' WHERE SYNC_KEY IN %s" % (
                     date, k))
             logger.info('___Update flag for synckeys {0} ok!!!___'.format(k))
             connection.commit()
         ret_val = True
     except(Exception, cx_Oracle.DatabaseError):
         logger.error("___Something was wrong during the write on the DB___")
         ret_val = False
     cursor.close()
     connection.close()
     return ret_val
Example #21
def process_elm_simple(options):
    """Determines (and writes) the ELM dictionary"""

    c_arg = ''
    if options.process_elm_simple.get('picloud', False): c_arg = '-c'

    for genome in ('H_sapiens', 'Gallus_gallus'):
        ofile = os.path.join('working', 'Jul22', 'elmdict_' + genome + '.simple')
        ifile = os.path.join('working', 'Jul22', genome + '.fa')
        st_elm_file = os.path.join('working', 'Jul22', 'simple_patterns')
        elms = {}
        with open(st_elm_file) as f:
            for line in f:
                elm, pattern = line.strip().split('\t')
                elms[elm] = pattern
        elm_files = []
        size = 1000
        if len(elms) > size:
            counter = 0
            for chunk in utils.chunks(elms.keys(), size):
                new_elm_file = 'working/elm_tmp_file' + str(counter)
                elm_files.append(new_elm_file)
                with open(new_elm_file, 'w') as f:
                    for elm in chunk:
                        f.write(elm + '\t' + elms[elm] + '\n')
                counter += 1
        else:
            elm_files.append(st_elm_file)

        if not os.path.exists(ofile) or options.process_elm_simple.get('forcenew', False):
            counter = 0
            for elmfile in elm_files:
                # only do if missing or FORCING
                sh('python makeELMdict.py %(c)s -o %(out)s %(infile)s %(elm)s'
                   % {'out': ofile, 'c': c_arg, 'infile': ifile, 'elm': elmfile})
                sh('mv ' + ofile + ' ' + ofile + str(counter))
                counter += 1
Example #22
    def encode(self, s, block=None):
        """ Encode a message.

        Parameters
        ----------
        s : str
            A message to encode.
        block : int, optional
            Divide output into blocks of this size.  All non-transcodable
            symbols will be stripped.  Specify the value `0` to strip all
            non-transcodable symbols and not divide into blocks.
            Specify the value `None` to disable chunking.  Default `None`.

        Returns
        -------
        out : str
            The encoded message.

        Notes
        -----
        Although this can invoke either `self._encode` or `super().encode`, it
        essentially falls prey to the "call super" antipattern and should
        probably be refactored. [TODO]

        """
        if block is not None:
            # filter message to characters in ciphertext alphabet
            s = intersect(s, self.alphabet)

            if block > 0:
                padding = upward_factor(block, len(s))
                s = s.ljust(padding, self.DEFAULT_NULLCHAR)

        out = super().encode(s)

        if block is not None and block > 0:
            out = ' '.join(chunks(out, block))

        return ''.join(out)
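To see what the block > 0 branch does in isolation (an illustration only, assuming upward_factor() rounds the length up to the next multiple of block and DEFAULT_NULLCHAR is a padding character such as 'X'):

msg = 'ATTACKATDAWN'                     # already restricted to the cipher alphabet
block = 5
padded = msg.ljust(-(-len(msg) // block) * block, 'X')
print(' '.join(padded[i:i + block] for i in range(0, len(padded), block)))
# -> ATTAC KATDA WNXXX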
Example #23
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("indir", help="path to directory of input files", type=str)
    parser.add_argument("outdir", help="path to directory of database files", type=str)
    parser.add_argument("-t", "--title", help="the base of the title for the blastdb", type=str, default="blastdb")
    parser.add_argument("-p", "--partitions", help="number of files to partition database into", type=int, default=1)
    parser.add_argument("--minpar", help="use the partition size", type=bool)
    args = parser.parse_args()

    blastpath = None
    if 'BLASTPATH' in os.environ:
        blastpath = os.environ['BLASTPATH']
    if not validdir(blastpath, "Invalid $BLASTPATH"):
        return 1
    if not validdir(args.indir, "Input directory does not exist"):
        return 1

    infiles = glob.glob(os.path.join(args.indir, INFORMAT))
    if len(infiles) == 0:
        sys.stderr.write("No valid input files")
        return 1

    num_partitions = args.partitions
    if args.minpar:
        num_partitions = len(infiles)

    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    processes = []
    partitions = list(chunks(infiles, len(infiles)/num_partitions))
    for i, partition in zip(range(len(partitions)), partitions):
        processes.append(createdb(blastpath, i, partition, args.outdir, args.title))

    for proc in processes:
        out, err = proc.communicate()
        print out

    return 0
Example #24
def get_attributes(args):
    """
    Gather all data necessary for metrics calculations
    """
    # Get publication information
    if 'query' in args:
        # If we were fed a query, gather the associated bibcodes
        bibcodes = get_publications_from_query(args['query'])
    elif 'bibcodes' in args:
        bibcodes = map(lambda a: a.strip(), args['bibcodes'])
    elif 'libid' in args:
        # In theory we allow for retrieving bibcodes from private libraries
        # Clearly this will currently not be used
        bibcodes = get_bibcodes_from_private_library(args['libid'])
    # Split the list of bibcodes up in chunks, for parallel processing
    biblists = list(chunks(bibcodes,config.METRICS_CHUNK_SIZE))
    # Get precomputed metrics data, key-ed on bibcode
    metrics_data = get_metrics_data(bibcodes=bibcodes)
    missing_bibcodes = filter(lambda a: a not in metrics_data.keys(), bibcodes)
    if len(missing_bibcodes) > 0:
        app.logger.error("Bibcodes found with missing metrics data: %s" % ",".join(missing_bibcodes))
    bibcodes = filter(lambda a: a not in missing_bibcodes, bibcodes)
    bibcodes_without_authnums = map(lambda b: b['_id'],filter(lambda a: a['author_num'] == 0, metrics_data.values()))
    if len(bibcodes_without_authnums):
        app.logger.error("Bibcodes found with author number equal to zero: %s" % ",".join(bibcodes_without_authnums))
    bibcodes = filter(lambda a: a not in bibcodes_without_authnums, bibcodes)
    # Get the number of citing papers
    Nciting = len(list(set(itertools.chain(*map(lambda a: a['citations'], metrics_data.values())))))
    # Nciting_ref refers to citation to the refereed papers in the set
    Nciting_ref = len(list(set(itertools.chain(*map(lambda b: b['citations'], filter(lambda a: a['refereed']==True,metrics_data.values()))))))
    # The attribute vectors will be used to calculate the metrics
    attr_list = make_vectors(bibcodes,metrics_data)
    # We sort the entries in the attribute list on citation count, which
    # will make e.g. the calculation of 'h' trivial
    attr_list = sort_list_of_lists(attr_list,2)

    return attr_list,Nciting,Nciting_ref
Example #25
def _cochlear_trim_sai_marginals(filename_and_indexes):
    try:
        filename, norm_segstart, norm_segend, audio_id, NAP_detail = filename_and_indexes
        sai_video_filename = '{}_sai_video_{}'.format(filename, NAP_detail)
        if os.path.isfile('{}.npy'.format(sai_video_filename)):
          return sai_video_filename

        if NAP_detail == 'high':
            try: 
                NAP = utils.csv_to_array(filename+'cochlear'+NAP_detail)
            except:
                NAP = brain.cochlear(filename, stride=1, rate=44100, apply_filter=0, suffix='cochlear'+NAP_detail)
        if NAP_detail == 'low':
            try: 
                NAP = utils.csv_to_array(filename+'cochlear'+NAP_detail)
            except: 
                NAP = brain.cochlear(filename, stride=IO.NAP_STRIDE, rate=IO.NAP_RATE, apply_filter=0, suffix='cochlear'+NAP_detail) # Seems to work best, in particular when they are all the same.

        num_channels = NAP.shape[1]
        input_segment_width = 2048
        sai_params = CreateSAIParams(num_channels=num_channels,
                                     input_segment_width=input_segment_width,
                                     trigger_window_width=input_segment_width,
                                     sai_width=1024)

        sai = pysai.SAI(sai_params)

        NAP = utils.trim_right(NAP[ np.int(np.rint(NAP.shape[0]*norm_segstart)) : np.int(np.rint(NAP.shape[0]*norm_segend)) ], threshold=.05)
        sai_video = [ np.copy(sai.RunSegment(input_segment.T)) for input_segment in utils.chunks(NAP, input_segment_width) ]
        del NAP        
        np.save(sai_video_filename, np.array([ sai_rectangles(frame) for frame in sai_video ]))
        return sai_video_filename

    except:
        print utils.print_exception('Calculation SAI video failed for file {}, NAP detail {}'.format(filename, NAP_detail))
        return False
    def process(self, process_id, date_str, app_ids):
        """
        print self.error_proxy_dict

        :param date_str:
        :param app_ids:
        :return:
        """
        print 'Started process, need to scrape {}'.format(len(app_ids))
        logger.info('Started process, need to scrape {}'.format(len(app_ids)))
        for batch_app_ids in chunks(app_ids, DEFAULT_BATCH_SIZE):
            for app_id in batch_app_ids:
                app_detail_key = DETAIL_SOURCE_KEY.format(date=date_str, market=self.market, app_id=app_id)
                if self.redis_service.exists(app_detail_key):
                    continue
                content = self._scrape(app_id)
                if content:
                    self._save(app_detail_key, content)

            garbage_number = gc.collect()
            print 'Garbage number:', garbage_number

        print 'Succeed process {}'.format(process_id)
        logger.info('Succeed process {}'.format(process_id))
 def move_to_destination(cls, response, destination):
     # destination is a pymongo MongoDB object
     for collection, items in response.items():
         print('writing collection:', collection, len(items))
         for chunk in utils.chunks(50, items):
             destination[collection].insert_many(chunk)
Example #28
def bt_nodes_info_from_raw_data(data):
    return [bt_contact_node(x) for x in chunks(data, 26)]
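Each 26-byte chunk here matches the BitTorrent DHT "compact node info" layout: a 20-byte node ID, a 4-byte IPv4 address, and a 2-byte big-endian port. A hedged sketch of the decoding that bt_contact_node presumably performs (the real constructor is not shown):

import socket
import struct

def decode_compact_node(entry):
    # entry is exactly 26 bytes: 20-byte node id + packed IPv4 + port.
    node_id, packed_ip, port = struct.unpack('!20s4sH', entry)
    return node_id, socket.inet_ntoa(packed_ip), port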
 def test_chunks(self):
     '''Test if the chunks method behaves properly'''
     lst = ['a', 'b', 'c', 'd']
     expected = [['a'], ['b'], ['c'], ['d']]
     self.assertEqual([x for x in chunks(lst, 1)], expected)
Example #30
def get_codons(seq):
	seq = seq.upper()
	codons = ut.chunks(seq,3)
	return codons
Example #31
    def from_obspy(cls, stream, params=None):
        try:
            import obspy
        except ModuleNotFoundError:
            print("Install obspy with `conda install -c conda-forge obspy`.")

        data = np.stack([t.data for t in stream.traces])
        if params is None:
            params = {}
        dt = params.get('dt', stream.binary_file_header.sample_interval_in_microseconds)

        # ndim param can force 2d or 3d data
        ndim = params.get('ndim', 0)
        if ndim:
            params.pop('ndim')

        # Make certain it winds up in seconds. Most likely 0.0005 to 0.008.
        while dt > 0.02:
            dt *= 0.001

        params['dt'] = dt

        # Since we have the headers, we can try to guess the geometry.
        threed = False

        # Get the sawtooth header field. In a perfect world, this only works for 3D.
        xlines = utils.get_pattern_from_stream(stream, patterns.sawtooth)
        if np.any(xlines) and (ndim != 2):
            threed = True
            nxlines = np.amax(xlines) - np.amin(xlines) + 1
            params['nxlines'] = params.get('nxlines') or nxlines
            params['xlines'] = params.get('xlines') or xlines
            params['dimensions'] = ['i', 'x', 't']
        else:
            xlines = utils.get_pattern_from_stream(stream, patterns.monotonic)
            if np.any(xlines):
                nxlines = np.amax(xlines) - np.amin(xlines) + 1
                params['nxlines'] = params.get('nxlines') or nxlines
                params['xlines'] = params.get('xlines') or xlines
            params['dimensions'] = ['i', 't']

        params['ninlines'] = 1
        if threed:
            inlines = utils.get_pattern_from_stream(stream, patterns.stairstep)
            if np.any(inlines):
                ninlines = np.amax(inlines) - np.amin(inlines) + 1
                params['ninlines'] = params.get('ninlines') or ninlines
                params['inlines'] = params.get('inlines') or inlines

        header = np.array(list(stream.textual_file_header.decode()))
        params['header'] = '\n'.join(c for c in utils.chunks(header, 80))

        headers = {
            'elevation': 'receiver_group_elevation',
            'fold': 'number_of_horizontally_stacked_traces_yielding_this_trace',
            'water_depth': 'water_depth_at_group',
        }

        for k, v in headers.items():
            params[k] = [t.header.__dict__[v] for t in stream.traces]

        return cls(data, params=params)
Example #32
        path_output.mkdir()

        for folder in local_folders:
            shutil.rmtree(folder, ignore_errors=True)

        #################### start of single url download ####################
        args.url_file = url  # download() depends on global level arg variable
        month = extract_month(args.url_file)
        # in case we are resuming from a previous run
        completed_uids, state_fp, prev_cid = get_state(month, args.output_dir)
        # URLs we haven't scraped yet (if first run, all URLs in file)
        url_entries = load_urls(args.url_file, completed_uids, args.max_urls)
        pool = mpl.Pool(args.n_procs)

        # process one "chunk" of args.chunk_size URLs at a time
        for i, chunk in enumerate(chunks(url_entries, args.chunk_size)):
            cid = prev_cid + i + 1

            print("Downloading chunk {}".format(cid))
            t1 = time.time()

            if args.timeout > 0:
                # imap as iterator allows .next() w/ timeout.
                # ordered version doesn't seem to work correctly.
                # for some reason, you CANNOT track j or chunk[j] in the loop,
                # so don't add anything else to the loop below!
                # confusingly, chunksize below is unrelated to our chunk_size
                chunk_iter = pool.imap_unordered(download, chunk, chunksize=1)
                cdata = []
                for j in range(len(chunk)):
                    try:
Example #33
    def __call__(self, base, shape, nnear=None, majority=True, pickle_name=None):
        """
        For each query point in the base array, find the K nearest
        neighbors and calculate either the majority value or the
        inverse-weighted value for those neighbors.

        Keyword arguments:
        base -- array of query points (x, y)
        shape -- shape of the output array
        nnear -- number of neighbors to check
        majority -- boolean: whether to use the majority algorithm
        pickle_name -- if given, save variables for pickling under this name

        """
        # Set nearest neighbors to default value of 11 if not set.
        if nnear is None:
            nnear = 11

        if self.canCL and self.wantCL:
            # These values do not change from run to run.
            values_buf = cla.to_device(self.queue, self.values)
            tree_buf = cla.to_device(self.queue, self.tree)
            coords_buf = cla.to_device(self.queue, self.coords)
            lentree_arg = np.uint32(len(self.tree))
            nnear_arg = np.uint32(nnear)
            usemajority_arg = np.uint32(1 if majority else 0)
            # Calculate how many base elements can be evaluated per run.
            static_data = self.values.nbytes + self.tree.nbytes + self.coords.nbytes + lentree_arg.nbytes + nnear_arg.nbytes + usemajority_arg.nbytes
            # Each base element is two float32s (8 bytes).
            bpe_single = 2*4
            # Each retval is one int32 (4 bytes).
            bpe_total = bpe_single + 4
            # Check both single and total limits for elems-per-slice.
            eps_single = [int(0.95*device.max_mem_alloc_size/bpe_single) for device in self.devices]
            eps_total = [int((0.95*device.global_mem_size-static_data)/bpe_total) for device in self.devices]
            elem_limits = [min(eps_single[x], eps_total[x]) for x in xrange(len(self.devices))]
            # For now, at least, do not create retval or chunk buffer here.
            results = []
            # NB: Only supporting one device for now.
            best_device = np.argmax(elem_limits)
            global_size = self.global_size[self.devices[best_device]]
            local_size = self.local_size[self.devices[best_device]]
            for chunk in chunks(base, elem_limits[best_device]):
                # Create retvals and chunk buffer here instead of above.
                lenchunk = len(chunk)
                retvals_arr = np.empty(lenchunk, dtype=np.int32)
                retvals_buf = cla.to_device(self.queue, retvals_arr)
                chunk_buf = cla.to_device(self.queue, chunk)
                lenchunk_arg = np.uint32(lenchunk)
                event = self.program.idt(self.queue, global_size, local_size, retvals_buf.data, values_buf.data, tree_buf.data, coords_buf.data, lentree_arg, chunk_buf.data, lenchunk_arg, nnear_arg, usemajority_arg)
                event.wait()
                # Copy retvals_buf to results.
                retvals_arr = retvals_buf.get()
                if results == []:
                    results = retvals_arr.tolist()
                else:
                    results += retvals_arr.tolist()
        else:
            # from invdisttree.py
            distances, indexes = self.tree.query(base, k=nnear)
            results = np.zeros((len(distances),) + np.shape(self.values[0]))
            jinterpol = 0
            for distance, index in zip(distances, indexes):
                if nnear == 1:
                    wz = self.values[index]
                elif distance[0] < 1e-10:
                    wz = self.values[index[0]]
                else:
                    w = 1/distance
                    w /= np.sum(w)
                    if majority:
                        majordict = dict([(x, 0) for x in self.values[index]])
                        for zval, wval in zip(self.values[index], w):
                            majordict[zval] += wval
                        wz = max(majordict, key=majordict.get)
                    else:
                        wz = np.dot(w, self.values[index])
                results[jinterpol] = wz
                jinterpol += 1
        if pickle_name is not None:
            # Pickle variables for testing purposes.
            picklefilename = 'idt-%s-%d.pkl.gz' % (pickle_name, (1 if majority else 0))
            print 'Pickling to %s...' % picklefilename
            f = gzip.open(picklefilename, 'wb')
            pickle.dump(self.coords, f, -1)
            pickle.dump(self.values, f, -1)
            pickle.dump(base, f, -1)
            pickle.dump(shape, f, -1)
            pickle.dump(nnear, f, -1)
            pickle.dump(majority, f, -1)
            # pickle.dump(results, f, -1)
        return np.asarray(results, dtype=np.uint32).reshape(shape)
def train_network( features, labels, alpha, dump_location ):

    features_1 = features
    labels_1 = labels
    features_2 = features.copy()
    labels_2 = labels.copy()


    model = confidence_network()
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
    criterion = torch.nn.CrossEntropyLoss()
    
 
    for epoch in range( 10 ):
        index_1 = np.arange(len(features_1))
        random.shuffle(index_1)
        index_2 = np.arange(len(features_2))
        random.shuffle(index_2)

        features_1 = features_1[index_1]
        labels_1 = labels_1[index_1]

        features_2 = features_2[index_2]
        labels_2 = labels_2[index_2]

        total = 0
        correct_pred = 0
        for step, ( f1, l1, f2, l2 ) in enumerate( zip(chunks(features_1),chunks(labels_1),
                                                            chunks(features_2), chunks(labels_2)) ):

            try:
                
                f1 = make_tensor(f1)
                f2 = make_tensor(f2)
                # norm_features = alpha * f1 + (1-alpha)*f2

                # norm_features = make_tensor(norm_features)

                labels = make_tensor(np.vstack([l1,l2]),dtype=torch.long)



                optimizer.zero_grad()
                
                preds_1 = model(f1,f2,alpha) 
                loss_1 = loss_norm( preds_1, labels, alpha, step )
                
                f_s_1, f_s_2, l_s = same_class_sample()
                f_s_1, f_s_2 = make_tensor(f_s_1), make_tensor(f_s_2)
                preds_2 = model(f_s_1,f_s_2,alpha)
                l_s = make_tensor(l_s, dtype=torch.long)
                loss_2 = criterion(preds_2,l_s)

                if(epoch <2):
                    loss = loss_1
                else:
                    loss = loss_1 + loss_2/10
                loss.backward()
                optimizer.step()

                _, predicted = torch.max(preds_2.data,1)
                total += l_s.size(0)
                correct_pred += (predicted == l_s).sum().item()
            
            except ValueError:
                pass

            if(step % 200 == 0):
                print("epoch {}, step {}, loss_1 {:.4f}, loss_2 {:.4f}".format(epoch,step, loss.data.item(), 0))
        print("++++++++++++++++")
        print("accuracy after {} epochs is {}".format(epoch,correct_pred/total))

    torch.save(model,dump_location)

    return model
Example #35
if __name__ == "__main__":
    random.seed(config.SEED)

    if not os.path.isdir(config.DATA_AR_FOLDER):
        print('ERR: {} does not exist'.format(config.DATA_AR_FOLDER))

    cik_folders = [
        os.path.join(config.DATA_AR_FOLDER, d)
        for d in os.listdir(config.DATA_AR_FOLDER)
        if os.path.isdir(os.path.join(config.DATA_AR_FOLDER, d))
    ]
    random.shuffle(cik_folders)  # Better separate work load

    if config.MULTITHREADING:
        folders = utils.chunks(cik_folders,
                               1 + int(len(cik_folders) / config.NUM_CORES))
        procs = []
        for i in range(config.NUM_CORES):
            procs.append(
                Process(target=process_folder_multithread,
                        args=(folders[i], )))
            procs[-1].start()

        for p in procs:
            p.join()
    else:
        connection = utils.create_mysql_connection()

        for folder in tqdm.tqdm(cik_folders,
                                desc="Extract data from annual reports"):
            process_folder(folder, connection)
Example #36
    return training_data, len(words)


with open("data.txt") as f:
    content = f.read()

window = 6
time_steps = window - 1
num_hidden = 512
num_input = 1
batch_size = 100
iteration = 250

training_data, num_classes = data_sampling(content, window=window)
# Build the Batches:
batches = chunks(training_data, batch_size)

# RNN output node weights and biases
weights = {'out': tf.Variable(tf.random_normal([num_hidden, num_classes]))}
biases = {'out': tf.Variable(tf.random_normal([num_classes]))}

# tf graph input
X = tf.placeholder("float", [None, time_steps, num_input], name='X')
Y = tf.placeholder("float", [None, num_classes])


def RNN(x, weights, biases):

    # Unstack to get a list of 'timesteps' tensors, each tensor has shape (batch_size, n_input)
    x = tf.unstack(x, time_steps, 1)
Example #37
                                ])

    args = parser.parse_args()
    curr_batch_order = args.batch_idx

    d = Dataset(dataset=args.dataset, dataset_type=args.dataset_type, path=args.path)
    pairs = d.load_dataset()

    # #Create PCA if not exists
    filename = os.path.join(args.path, 'pca', 'pca_%s_%s_%s.pkl' % (args.dataset, args.dataset_type, args.emb))
    if not os.path.isfile(filename):
        print('PCA file make!')
        pca_save(pairs, args.emb, args.bert_encode_type, args.encoder_path, filename, args.pca_components)

    if args.batch:
        batches = chunks(pairs,args.batch_size)
        for i in range(curr_batch_order+1):
            batch = next(batches)

    else:
        batch = pairs

    print("batch from %d to %d" %(curr_batch_order*args.batch_size, curr_batch_order*args.batch_size+args.batch_size))

    #Load (or Save) Encoder File
    encoder_file = os.path.join(args.path,'bert','{}.{}.{}.{}.pkl'.format
                    (args.dataset, args.dataset_type, args.emb, args.batch_idx))

    if not os.path.isfile(encoder_file):
        e = Encoder(path=args.encoder_path, emb=args.emb, bert_encode_type=args.bert_encode_type)
        data = e.encode(pairs=batch)
Example #38
def approximate_mono_image(img, num_coeffs=None, scale_factor=1):
    """
    Approximates a single channel image by using only the first coefficients of the DCT.
     First, the image is chopped into 8x8 pixels patches and the DCT is applied to each patch.
     Then, if num_coeffs is provided, only the first K DCT coefficients are kept.
     If not, all the elements are quantized using the JPEG quantization matrix and the scale_factor.
     Finally, the resulting coefficients are used to approximate the original patches with the IDCT, and the image is
     reconstructed back again from these patches.
    :param img: Image to be approximated.
    :param num_coeffs: Number of DCT coefficients to use.
    :param scale_factor: Scale factor to use in the quantization step.
    :return: The approximated image.
    """

    # prevent against multiple-channel images
    if len(img.shape) != 2:
        raise ValueError('Input image must be a single channel 2D array')

    # shape of image
    height = img.shape[0]
    width = img.shape[1]
    if (height % 8 != 0) or (width % 8 != 0):
        raise ValueError("Image dimensions (%s, %s) must be multiple of 8" %
                         (height, width))

    # split into 8 x 8 pixels blocks
    img_blocks = [
        img[j:j + 8, i:i + 8]
        for (j,
             i) in itertools.product(xrange(0, height, 8), xrange(0, width, 8))
    ]

    # DCT transform every 8x8 block
    dct_blocks = [cv.dct(img_block) for img_block in img_blocks]

    if num_coeffs is not None:
        # keep only the first K DCT coefficients of every block
        reduced_dct_coeffs = [
            utils.zig_zag(dct_block, num_coeffs) for dct_block in dct_blocks
        ]
    else:
        # quantize all the DCT coefficients using the quantization matrix and the scaling factor
        reduced_dct_coeffs = [
            np.round(dct_block / (utils.jpeg_quantiz_matrix * scale_factor))
            for dct_block in dct_blocks
        ]

        # and get the original coefficients back
        reduced_dct_coeffs = [
            reduced_dct_coeff * (utils.jpeg_quantiz_matrix * scale_factor)
            for reduced_dct_coeff in reduced_dct_coeffs
        ]

    # IDCT of every block
    rec_img_blocks = [
        cv.idct(coeff_block) for coeff_block in reduced_dct_coeffs
    ]

    # reshape the reconstructed image blocks
    rec_img = []
    for chunk_row_blocks in utils.chunks(rec_img_blocks, width / 8):
        for row_block_num in xrange(8):
            for block in chunk_row_blocks:
                rec_img.extend(block[row_block_num])
    rec_img = np.array(rec_img).reshape(height, width)

    return rec_img
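A minimal way to exercise approximate_mono_image (a sketch only: it assumes cv is the cv2 module, a hypothetical input file named input_gray.png, and dimensions cropped to multiples of 8; note the function body itself is Python 2, via xrange):

import cv2 as cv
import numpy as np

img = cv.imread('input_gray.png', cv.IMREAD_GRAYSCALE).astype(np.float32)
h, w = img.shape
img = img[:h - h % 8, :w - w % 8]               # enforce the multiple-of-8 requirement
approx = approximate_mono_image(img, num_coeffs=10)
cv.imwrite('approx_dct10.png', np.clip(approx, 0, 255).astype(np.uint8))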