Example #1
def test(model_input, labels, model, loss_fn=None, batch_size=32):
    """

    Args:
        model_input: list of tuples containing input to model
        labels: list of tuples containing labels corresponding to model input for training
        model: (torch.nn.Module) the neural network
        loss_fn: a function that takes batch_output and batch_labels and computes the loss for the batch
        batch_size: maximum batch_size

    Returns:
        metrics: (dict) metrics aggregated over all batches
    """

    metrics = {}
    for batch_input, batch_labels in zip(grouper(model_input, batch_size),
                                         grouper(labels, batch_size)):
        batch_input = list(
            filter(lambda x: x is not None,
                   batch_input))  # remove None objects introduced by grouper
        batch_labels = list(
            filter(lambda x: x is not None,
                   batch_labels))  # remove None objects introduced by grouper

        batch_metrics = test_batch(batch_input,
                                   batch_labels,
                                   model,
                                   loss_fn=loss_fn)
        add_dict(metrics, batch_metrics)

    return metrics
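
Most of the snippets on this page assume a grouper helper along the lines of the classic itertools recipe, which pads the last group with a fill value (None by default). A minimal sketch is below; note that the argument order varies between projects, with some calling grouper(iterable, n) and others grouper(n, iterable).

from itertools import zip_longest

def grouper(iterable, n, fillvalue=None):
    # Collect data into fixed-length chunks; the last chunk is padded
    # with fillvalue, which is why several examples filter out None.
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)
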
Example #2
 def test_grouper(self):
     self.assertEqual(
         [list(g) for g in utils.grouper(python_utils.RANGE(7), 3)],
         [[0, 1, 2], [3, 4, 5], [6, None, None]])
     # Returns an iterable of iterables, so we need to combine them into
     # strings for easier comparison.
     self.assertEqual(
         [''.join(g) for g in utils.grouper('ABCDEFG', 3, fillvalue='x')],
         ['ABC', 'DEF', 'Gxx'])
Example #3
 def test_epoch_in_batches(self, batch_size):
     test_list = list(range(len(self.og.test.images)))
     np.random.shuffle(test_list)
     for batch_i in grouper(test_list, batch_size):
         batch = [(self.og.test.images[i], self.og.test.labels[i])
                  for i in batch_i if i is not None]
         yield zip(*batch)
Example #4
def main(args):
    global DEBUG
    if len(args) == 1:
        # no args - repl
        while True:
            print 'que?>',
            try:
                print google_it(raw_input())
            except EOFError:
                break
            except:
                import traceback
                traceback.print_exc()
    else:
        # test mode
        DEBUG = False
        print 'Loading testfile...'
        tests = filter(bool, open(args[1]).read().split('\n'))

        print len(tests), 'tests'
        for clue, answer in utils.grouper(2, tests):
            clue = clue.split('~!clue')[1]
            answer = answer.split("~!answer")[1]
            try:
                print '----------------------------------------------------------------'
                print 'clue:', clue
                print 'correct:', answer
                print 'eubank:', google_it(clue)
            except KeyboardInterrupt:
                sys.exit(0)
            except:
                import traceback
                traceback.print_exc()
Example #5
def main():
    description = 'Split a FASTA file into multiple subfiles.'
    parser = ArgumentParser(description=description,
                            parents=[get_default_argument_parser()])
    parser.add_argument('-f', '--in-format',
                        default=_DEFAULT_FMT,
                        help="A biopython file format string.")
    parser.add_argument('-n', '--num-files', type=int,
                        default=_DEFAULT_N,
                        help=("The number of splits. "
                              "DEFAULT=%d") % _DEFAULT_N)
    parser.add_argument('in_path', nargs='?', default=None,
                        help=("The path of the file to be read in. "
                              "If no argument given, reads from STDIN."))
    parser.add_argument('out_pattern', default=None,
                        help=("Output file names format string. "
                              "Must contain one '%%d' for the file number."))
    args = parser.parse_args()

    if args.in_path is None:
        record_parser = SeqIO.parse(sys.stdin, args.in_format)
    else:
        record_parser = SeqIO.parse(args.in_path, args.in_format)

    write_multithread(grouper(record_parser, 100),
                      lambda recs, handle:
                          SeqIO.write(recs, handle, args.in_format),
                      args.out_pattern, n=args.num_files)
Example #6
def generate_predictions(model, lemmas, tags, batch_size=32):
    """Returns predicted inflected forms for given lemmas and tags."""

    lemmas_indices = model.vocab.words_to_indices(lemmas,
                                                  start_char=True,
                                                  stop_char=True)
    tags_indices = model.vocab.tag_to_indices(tags)
    model_input = list(zip(lemmas_indices, tags_indices))

    predictions = []

    for batch_input in grouper(model_input, batch_size):
        batch_input = list(
            filter(lambda x: x is not None,
                   batch_input))  # remove None objects introduced by grouper

        # set model to evaluating mode
        model.eval()

        # compute model output and loss
        p_ws, a_ls, p_gens = model(*zip(*batch_input))
        batch_predictions = [
            word.split(model.vocab.STOP_CHAR)[0]
            for word in model.vocab.indices_to_word(p_ws.argmax(2))
        ]
        predictions += batch_predictions

    return predictions
Example #7
def insert(rows):
    """Insert/Bulk insert values into the table.

    Parameters
    ----------
    rows : str
        A string containing one value per column in the database
        setup. Column values are separated by commas and/or each row
        is delineated by parentheses.
    """
    # TODO Try to handle special characters that are difficult
    global no_cols
    if no_cols is None:
        no_cols = len(get_one()[0])
    rd = csv.DictReader(io.StringIO(rows))
    try:
        # TODO Figure out what errors could occur
        dta = [item.rstrip(")").lstrip(" (") for item in rd.fieldnames]
        data = list(utils.grouper(no_cols, dta))
        fields = ("?, " * no_cols).rstrip(", ")
        command = "INSERT INTO t1 VALUES (%s)" % fields
        db.executemany(command, data)
    except:
        raise
    db.commit()
    return "Successfully inserted %s" % rows
Example #8
    def train(self, sentences, total_words=None, word_count=0, chunksize=100):
        """
        Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).
        Each sentence must be a list of utf8 strings.

        """
        logger.info("training model on %i vocabulary and %i features" % (len(self.vocab), self.layer1_size))
        if not self.vocab:
            raise RuntimeError("you must first build vocabulary before training the model")

        start, next_report = time.time(), 1.0
        if not total_words:
            total_words = sum(v.count for v in self.vocab.itervalues())
        # convert input string lists to Vocab objects (or None for OOV words)
        no_oov = ([self.vocab.get(word, None) for word in sentence] for sentence in sentences)
        # run in chunks of e.g. 100 sentences (= 1 job) 
        for job in utils.grouper(no_oov, chunksize):
            # update the learning rate before every job
            alpha = max(self.min_alpha, self.alpha * (1 - 1.0 * word_count / total_words))
            # how many words did we train on? out-of-vocabulary (unknown) words do not count
            job_words = sum(train_sentences(self, sentence, alpha) for sentence in job)
            word_count += job_words
            # report progress
            elapsed = time.time() - start
            if elapsed >= next_report:
                logger.info("PROGRESS: at %.2f%% words, alpha %.05f, %.0f words/s" %
                    (100.0 * word_count / total_words, alpha, word_count / elapsed if elapsed else 0.0))
                next_report = elapsed + 1.0  # don't flood the log, wait at least a second between progress reports
        elapsed = time.time() - start
        logger.info("training on %i words took %.1fs, %.0f words/s" %
            (word_count, elapsed, word_count / elapsed if elapsed else 0.0))
        return word_count
Example #9
def main():
    logger = configure_logging('parse_serverstatus')
    client = InfluxDBClient(host=args.influxdb_host, ssl=args.ssl, verify_ssl=False, port=8086, database=args.database)
    with open(args.input_file, 'r') as f:
        for line_number, chunk in enumerate(grouper(f, args.batch_size)):
            # print(line_number)
            json_points = []
            for line in chunk:
                # zip_longest will backfill any missing values with None, so we need to handle this, otherwise we'll miss the last batch
                if line:
                    try:
                        server_status_json = json.loads(line)
                        # print((line_number + 0) * _BATCH_SIZE)
                        # print((line_number + 1) * _BATCH_SIZE)
                        common_metric_data = get_metrics("serverstatus", server_status_json, common_metrics, line_number)
                        json_points.append(create_point(*common_metric_data))
                        wiredtiger_metric_data = get_metrics("serverstatus_wiredtiger", server_status_json, wiredtiger_metrics, line_number)
                        json_points.append(create_point(*wiredtiger_metric_data))
                        # for metric_data in get_metrics(server_status_json, common_metrics, line_number):
                        #     import ipdb; ipdb.set_trace()
                        #     print(json_points)
                        #     json_points.append(create_point(*metric_data))
                        # # for metric in get_metrics(server_status_json, wiredtiger_metrics, line_number):
                        #     json_points.append(create_point(*metric))
                        # for metric in get_metrics(server_status_json, mmapv1_metrics, line_number):
                        #     json_points.append(create_point(*metric))
                    except ValueError:
                        logger.error("Line {} does not appear to be valid JSON - \"{}\"".format(line_number, line.strip()))
            write_points(logger, client, json_points, line_number)
Example #10
def get_context(title, word_to_synset, nasari):
    context = []
    # Split the title to avoid generating too many sense combinations.
    # The second parameter is the number of title words taken into account when determining the context;
    # this number can be changed.
    for chunk in utils.grouper(title, 6):

        print("chunk", chunk)
        babel_ids = get_babel_ids(chunk, word_to_synset)

        # Possible sense combinations via the Cartesian product
        lista_ids = list(itertools.product(*babel_ids))
        print("lunghezza combinazioni", len(lista_ids))

        max_sim_tup = 0
        for word in chunk:
            # Initialize the best tuple with the first senses that exist in Nasari
            best_tup_ids = get_vector(word, word_to_synset, nasari)

        for tuple_ids in lista_ids:  # TWO WAYS TO MEASURE SIMILARITY
            # sim_tup = similarity_tuple_intersection(tuple_ids, nasari)
            sim_tup = similarity_tuple(tuple_ids, nasari)
            if sim_tup > max_sim_tup:
                max_sim_tup = sim_tup
                best_tup_ids = tuple_ids

        # Build the context for the current chunk
        for best_id in best_tup_ids:
            vect = nasari.get(
                best_id
            )  # Extract from Nasari the vectors of the best senses
            if vect is not None:
                context.append(vect)

    return clean_context(context)
Example #11
    def _unblock(self, bot, update):
        message = update.message

        if message.from_user.id != self._admin_id and not is_user_group_admin(bot, message.from_user.id,
                                                                              message.chat_id, self._admin_id):
            message.reply_text(text=self._ADMIN_RESTRICTION_MESSAGE, quote=False)
            return

        blocked_stickerpacks = self._get_blocked_stickerpacks()

        packs_list = []
        buttons = []

        if blocked_stickerpacks:
            for index, stickerpack in enumerate(blocked_stickerpacks, start=1):
                packs_list.append(f'{index}. [{stickerpack.name}]({self._get_stickers_link(stickerpack.name)})')
                buttons.append(
                    InlineKeyboardButton(
                        text=str(index),
                        callback_data=set_callback_data(stickerpack.id))
                )

            response_text = '*Заблокированные стикерпаки:*\n{}\n\nКакой *разблокировать*?'.format("\n".join(packs_list))

        keyboard = grouper(buttons, 5)  # five buttons per row, since each button's label is short
            reply_markup = InlineKeyboardMarkup(keyboard, one_time_keyboard=True)
        else:
            response_text = self._NO_STICKERPACKS_BLOCKED_MESSAGE
            reply_markup = None

        message.reply_text(text=response_text, parse_mode=ParseMode.MARKDOWN, reply_markup=reply_markup, quote=False)
Example #12
def compute_descriptors(infile, descriptor_types):
    """Reads low-level descriptors from DenseTracks."""

    LEN_LINE = 436

    POS_IDXS = [1, 2, 0]        # Position coordinates (X, Y, T).
    NORM_POS_IDXS = [7, 8, 9]   # Normalized position coordinates (X, Y, T).

    dense_tracks = subprocess.Popen(
        [DENSE_TRACK, infile],
        stdout=subprocess.PIPE)

    for lines in grouper(dense_tracks.stdout, NR_DESCRIPTORS):
        all_descs = np.vstack([
            map(float, line.split())
            for line in lines
            if line is not None]
        ).astype(np.float32)

        assert all_descs.shape[0] <= NR_DESCRIPTORS
        assert all_descs.shape[1] == LEN_LINE

        positions = all_descs[:, POS_IDXS]
        normalized_positions = all_descs[:, NORM_POS_IDXS]
        descriptors = {
            desc_type: all_descs[:, DESC_IDXS[desc_type]]
            for desc_type in descriptor_types}

        yield positions, normalized_positions, descriptors
Example #13
 def __iter__(self):
     if self.chunksize:
         for chunk in utils.grouper(self.corpus, self.chunksize):
             for transformed in self.obj.__getitem__(chunk, chunksize=None):
                 yield transformed
     else:
         for doc in self.corpus:
             yield self.obj[doc]
Example #14
 def test_epoch_in_batches(self, batch_size):
     test_list = list(range(len(self.og['test']['images'])))
     np.random.shuffle(test_list)
     for batch_i in grouper(test_list, batch_size):
         batch = [(self.read_preprocess(self.og['test']['images'][i]),
                   self.og['test']['labels'][i])
                 for i in batch_i if i is not None]
         yield zip(*batch)
Example #15
    def train(self, sentences, total_words=None, word_count=0, chunksize=100):
        """
        Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).
        Each sentence must be a list of utf8 strings.

        """
        logger.info("training model with %i workers on %i vocabulary and %i features" % (self.workers, len(self.vocab), self.layer1_size))

        if not self.vocab:
            raise RuntimeError("you must first build vocabulary before training the model")

        start, next_report = time.time(), [1.0]
        word_count, total_words = [word_count], total_words or sum(v.count for v in self.vocab.itervalues())
        jobs = Queue(maxsize=2 * self.workers)  # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :(
        lock = threading.Lock()  # for shared state (=number of words trained so far, log reports...)

        def worker_train():
            """Train the model, lifting lists of sentences from the jobs queue."""
            work = matutils.zeros_aligned(self.layer1_size, dtype=REAL)  # each thread must have its own work memory

            while True:
                job = jobs.get()
                if job is None:  # data finished, exit
                    break
                # update the learning rate before every job
                alpha = max(self.min_alpha, self.alpha * (1 - 1.0 * word_count[0] / total_words))
                # how many words did we train on? out-of-vocabulary (unknown) words do not count
                job_words = sum(train_sentence(self, sentence, alpha, work) for sentence in job)
                with lock:
                    word_count[0] += job_words
                    elapsed = time.time() - start
                    if elapsed >= next_report[0]:
                        logger.info("PROGRESS: at %.2f%% words, alpha %.05f, %.0f words/s" %
                            (100.0 * word_count[0] / total_words, alpha, word_count[0] / elapsed if elapsed else 0.0))
                        next_report[0] = elapsed + 1.0  # don't flood the log, wait at least a second between progress reports

        workers = [threading.Thread(target=worker_train) for _ in xrange(self.workers)]
        for thread in workers:
            thread.daemon = True  # make interrupting the process with ctrl+c easier
            thread.start()

        # convert input strings to Vocab objects (or None for OOV words), and start filling the jobs queue
        no_oov = ([self.vocab.get(word, None) for word in sentence] for sentence in sentences)
        for job_no, job in enumerate(utils.grouper(no_oov, chunksize)):
            logger.debug("putting job #%i in the queue, qsize=%i" % (job_no, jobs.qsize()))
            jobs.put(job)
        logger.info("reached the end of input; waiting to finish %i outstanding jobs" % jobs.qsize())
        for _ in xrange(self.workers):
            jobs.put(None)  # give the workers heads up that they can finish -- no more work!

        for thread in workers:
            thread.join()

        elapsed = time.time() - start
        logger.info("training on %i words took %.1fs, %.0f words/s" %
            (word_count[0], elapsed, word_count[0] / elapsed if elapsed else 0.0))

        return word_count[0]
Example #16
 def test_epoch_in_batches(self, batch_size):
     test_list = list(range(len(self.og.test.images)))
     np.random.shuffle(test_list)
     for batch_i in grouper(test_list, batch_size):
         batch = [(np.reshape(
             resize(np.reshape(self.og.test.images[i], (28, 28)), (32, 32),
                    mode='constant'), (1024, )), self.og.test.labels[i])
                  for i in batch_i if i is not None]
         yield zip(*batch)
Example #17
def import_json():
    for g in grouper(1000,sys.stdin):
        try:
            Model.database.bulk_save([json.loads(l) for l in g if l])
        except BulkSaveError as err:
            if any(d['error']!='conflict' for d in err.errors):
                raise
            else:
                logging.warn("conflicts for %r",[d['id'] for d in err.errors])
Example #18
    def reduce(key, values):
        # The reduce() function must be static, so we manually create a "cls"
        # variable instead of changing the function into a classmethod.
        cls = PopulateFirebaseAccountsOneOffJob

        if key == cls.POPULATED_KEY:
            yield (cls.AUDIT_KEY, len(values))
            return
        elif key in (cls.SUPER_ADMIN_ACK, cls.SYSTEM_COMMITTER_ACK):
            yield (key, values)
            return

        # NOTE: This is only sorted to make unit testing easier.
        user_fields = sorted(ast.literal_eval(v) for v in values)
        user_records = [
            firebase_auth.ImportUserRecord(
                uid=auth_id,
                email=email,
                email_verified=True,
                custom_claims=('{"role":"%s"}' %
                               feconf.FIREBASE_ROLE_SUPER_ADMIN
                               if user_is_super_admin else None))
            for auth_id, _, email, user_is_super_admin in user_fields
        ]

        # The Firebase Admin SDK places a hard-limit on the number of users that
        # can be "imported" in a single call. To compensate, we break up the
        # users into chunks.
        offsets = python_utils.RANGE(
            0, len(user_records), cls.MAX_USERS_FIREBASE_CAN_IMPORT_PER_CALL)
        results = (cls.populate_firebase(
            [r for r in record_group
             if r is not None]) for record_group in utils.grouper(
                 user_records, cls.MAX_USERS_FIREBASE_CAN_IMPORT_PER_CALL))

        assocs_to_create = []
        for offset, (result, exception) in python_utils.ZIP(offsets, results):
            if exception is not None:
                yield (cls.ERROR_KEY, repr(exception))
            else:
                successful_indices = set(
                    python_utils.RANGE(result.success_count +
                                       result.failure_count))
                for error in result.errors:
                    successful_indices.remove(error.index)
                    debug_info = 'Import user_id=%r failed: %s' % (
                        user_fields[offset + error.index][1], error.reason)
                    yield (cls.ERROR_KEY, debug_info)
                assocs_to_create.extend(
                    auth_domain.AuthIdUserIdPair(*user_fields[offset + i][:2])
                    for i in successful_indices)

        if assocs_to_create:
            firebase_auth_services.associate_multi_auth_ids_with_user_ids(
                assocs_to_create)
            yield (cls.SUCCESS_KEY, len(assocs_to_create))
Example #19
def row_batch_iter(rows, min_size, n):
    if cfg.group_length:
        rows.sort(key=lambda row: len(row[0]))

    csv_batches = list(utils.grouper(cfg.batch_size, rows, None))
    random.shuffle(csv_batches)
    for i in range(n):
        for batch in csv_batches:
            if is_batch_valid(batch):
                yield pack(batch, min_size)
Example #20
def read_slr(fh):
    stats = fh.readline()
    seqs = []

    for l in utils.grouper(fh, 2):
        name = l[0].rstrip()
        seq = l[1].rstrip()
        seqs.append(SeqRecord(id=name, seq=Seq(seq), description=""))
        
    return seqs
Example #21
def retrieve_nodes_given_sentences(out_fname, batch_size, all_input_sentences,
                                   glosses_bnids, glosses_feats, topk):
    """
        out_fname(str):                     Output file to write retrieved node ids to.
        batch_size(int):                    Batch size for Sentence BERT.
        all_input_sentences(list[str]):     All input sentences loaded from `input_file`.
        glosses_bnids(list[str]):           All gloss BNids loaded from `args.glosses_bnids`. Aligned with `glosses_feats`.
        glosses_feats(numpy.array):         Numpy array with VisualSem gloss features computed with Sentence BERT.
        topk(int):                          Number of nodes to retrieve for each input sentence.
    """
    if os.path.isfile(out_fname):
        raise Exception(
            "File already exists: '%s'. Please remove it manually to avoid tampering."
            % out_fname)

    n_examples = len(all_input_sentences)
    print("Number of input examples to extract BNIDs for: ", n_examples)
    model = SentenceTransformer('distiluse-base-multilingual-cased')

    with open(out_fname, 'w', encoding='utf8') as fh_out:
        ranks_predicted = []
        for idxs_ in grouper(batch_size, range(n_examples)):
            idxs = []
            queries = []
            for i in idxs_:
                if not i is None:
                    idxs.append(i)
                    queries.append(all_input_sentences[i])

            queries_embs = model.encode(queries, convert_to_tensor=True)
            queries_embs = queries_embs.cuda()
            scores = util.pytorch_cos_sim(queries_embs, glosses_feats)
            scores = scores.cpu().numpy()

            ranks = numpy.argsort(
                scores)  # sort scores by cosine similarity (low to high)
            ranks = ranks[:, ::-1]  # sort by cosine similarity (high to low)
            for rank_idx in range(len(idxs[:ranks.shape[0]])):
                bnids_predicted = []
                for rank_predicted in range(topk * 10):
                    bnid_pred = glosses_bnids[ranks[rank_idx, rank_predicted]]
                    bnid_pred_score = scores[rank_idx, ranks[rank_idx,
                                                             rank_predicted]]
                    if not bnid_pred in bnids_predicted:
                        bnids_predicted.append((bnid_pred, bnid_pred_score))
                    if len(bnids_predicted) >= topk:
                        break

                # write top-k predicted BNids
                for iii, (bnid, score) in enumerate(bnids_predicted[:topk]):
                    fh_out.write(bnid + "\t" + "%.4f" % score)
                    if iii < topk - 1:
                        fh_out.write("\t")
                    else:  # iii == topk-1
                        fh_out.write("\n")
Example #22
 def __init__(self, horn_pointing=False, siamfile=None):
     self.horn_pointing = horn_pointing
     if siamfile is None:
         siamfile = private.siam
     f = open(siamfile)
     lines = f.readlines()
     self.siam = {}
     for line in grouper(4,lines[1:]):
         chtag = line[0].split()[0]
         m = np.array(np.matrix(';'.join(line[1:])))
         self.siam[chtag] = m
Example #23
 def buffered_read(self, fnames):
     '''Read packed batches from data with each batch having lines of similar lengths'''
     for line_collection in self.buffered_read_sorted_lines(fnames):
         batches = [
             b for b in utils.grouper(cfg.batch_size, line_collection)
         ]
         random.shuffle(batches)
         for batch in batches:
             ret = self.pack(batch)
             if ret is not None:
                 yield ret
Example #24
def import_old_json():
    for g in grouper(1000,sys.stdin):
        docs = [json.loads(l) for l in g if l]
        for d in docs:
            del d['doc_type']
            for k,v in d.iteritems():
                if k[-2:]=='id' or k in ('rtt','rtu'):
                    d[k]=v[1:]
            for field in ['ats','fols','frs']:
                if field in d and isinstance(d[field],list):
                    d[field] = [u[1:] for u in d[field]]
        Model.database.bulk_save(docs)
Example #25
def join(paths, output_path, batch_size=100):
    ''' Stitch a bunch of chunks into a single file '''
    incomplete_output_path = f'{output_path}.incomplete'
    with open(incomplete_output_path, 'wt') as output_file:
        try:
            # Concatenate a batch of files at a time, in case the file list is too long
            for batch in grouper(paths, batch_size):
                subprocess.check_call(['cat'] + batch,
                                      stdout=output_file,
                                      stderr=subprocess.PIPE)
        except subprocess.CalledProcessError:
            raise RuntimeError(f'Unable to join files into {output_path}')
        os.rename(incomplete_output_path, output_path)
Example #26
def xfory(price_info, units):
    """ function to discount per groups. if you pay Y you get X """
    total = 0
    x = price_info.get('x')
    y = price_info.get('y')
    price = price_info.get('unitPrice')

    for group in grouper(x, range(0, units)):
        has_discount = len(group) == x
        per_unit = price if not has_discount else y / x * price
        total = total + (per_unit * len(group))

    return total / units
Example #27
def test(net, img, hyperparams):
    """
    Test a model on a specific image
    """
    net.eval()
    patch_size = hyperparams['patch_size']
    center_pixel = hyperparams['center_pixel']
    batch_size, device = hyperparams['batch_size'], hyperparams['device']
    n_classes = hyperparams['n_classes']

    kwargs = {
        'step': hyperparams['test_stride'],
        'window_size': (patch_size, patch_size)
    }
    probs = np.zeros(img.shape[:2] + (n_classes, ))

    iterations = count_sliding_window(img, **kwargs) // batch_size
    for batch in tqdm(grouper(batch_size, sliding_window(img, **kwargs)),
                      total=(iterations),
                      desc="Inference on the image"):
        with torch.no_grad():
            if patch_size == 1:
                data = [b[0][0, 0] for b in batch]
                data = np.copy(data)
                data = torch.from_numpy(data)
            else:
                data = [b[0] for b in batch]
                data = np.copy(data)
                data = data.transpose(0, 3, 1, 2)
                data = torch.from_numpy(data)
                # data = data.unsqueeze(1)              # executed when using 3D convolutions

            indices = [b[1:] for b in batch]
            data = data.to(device)
            output = net(data)
            if isinstance(output, tuple):
                output = output[0]
            output = output.to('cpu')  # change 'cpu' to 'cuda' to keep the output on the GPU

            if patch_size == 1 or center_pixel:
                output = output.numpy()
            else:
                output = np.transpose(output.numpy(), (0, 2, 3, 1))
            for (x, y, w, h), out in zip(indices, output):
                if center_pixel:
                    # probs[x, y] += out
                    probs[x + w // 2, y + h // 2] += out
                    # probs[x:x + w, y:y + h] += out
                else:
                    probs[x:x + w, y:y + h] += out
    return probs
Example #28
def test(net, img, hyperparams):
    """
    Test a model on a specific image
    """
    net.eval()
    patch_size = hyperparams["patch_size"]
    center_pixel = hyperparams["center_pixel"]
    batch_size, device = hyperparams["batch_size"], hyperparams["device"]
    n_classes = hyperparams["n_classes"]

    kwargs = {
        "step": hyperparams["test_stride"],
        "window_size": (patch_size, patch_size),
    }
    probs = np.zeros(img.shape[:2] + (n_classes,))

    iterations = count_sliding_window(img, **kwargs) // batch_size
    for batch in tqdm(
        grouper(batch_size, sliding_window(img, **kwargs)),
        total=(iterations),
        desc="Inference on the image",
    ):
        with torch.no_grad():
            if patch_size == 1:
                data = [b[0][0, 0] for b in batch]
                data = np.copy(data)
                data = torch.from_numpy(data)
            else:
                data = [b[0] for b in batch]
                data = np.copy(data)
                data = data.transpose(0, 3, 1, 2)
                data = torch.from_numpy(data)
                data = data.unsqueeze(1)

            indices = [b[1:] for b in batch]
            data = data.to(device)
            output = net(data)
            if isinstance(output, tuple):
                output = output[0]
            output = output.to("cpu")

            if patch_size == 1 or center_pixel:
                output = output.numpy()
            else:
                output = np.transpose(output.numpy(), (0, 2, 3, 1))
            for (x, y, w, h), out in zip(indices, output):
                if center_pixel:
                    probs[x + w // 2, y + h // 2] += out
                else:
                    probs[x : x + w, y : y + h] += out
    return probs
Example #29
 def insert_blat_hits_into_db(self, blat_output, hits_per_chunk=50000):
     """
     Insert peptide hits from BLAT into db.
     """
     for values in grouper(hits_per_chunk, blat_output):
         self.db.executemany(
             "INSERT INTO mappings VALUES (?, ?, ?, ?, ?, ?)", values)
     self.db.execute(
         "INSERT INTO peptides (peptide) SELECT DISTINCT peptide FROM mappings"
     )
     self.db.execute(
         "CREATE INDEX i_peptides_disc ON peptides(discriminative_taxid)")
     self.db.execute("CREATE INDEX i_mappings_targets ON mappings(target)")
     self.db.commit()
Example #30
def import_fasta(fasta_file, tfhost, tfpath):

    tfserver = 'http://{}{}'.format(tfhost, tfpath)
    seqiter = read_sequences(fasta_file)

    for batch in grouper(seqiter):
        ids, seqs = zip(*batch)
        preds = infer_batch(seqs, tfserver)
        for i, s, p in zip(ids, seqs, preds):
            p['id'] = i
            p['seq'] = s
            print(p)

    return
Example #31
def test(net, img, args):
    """
    Test a model on a specific image
    """
    net.eval()
    patch_size = args.patch_size
    center_pixel = args.center_pixel
    batch_size, device = args.batch_size, torch.device(args.device)
    n_classes = args.n_classes

    kwargs = {
        'step': args.test_stride,
        'window_size': (patch_size, patch_size)
    }
    probs = np.zeros(img.shape[:2] + (n_classes, ))

    iterations = utils.count_sliding_window(img, **kwargs) // batch_size
    for batch in tqdm(utils.grouper(batch_size,
                                    utils.sliding_window(img, **kwargs)),
                      total=(iterations),
                      desc="Inference on the image"):
        with torch.no_grad():
            if patch_size == 1:
                data = [b[0][0, 0] for b in batch]
                data = np.copy(data)
                data = torch.from_numpy(data)
            else:
                data = [b[0] for b in batch]
                data = np.copy(data)
                data = data.transpose(0, 3, 1, 2)
                data = torch.from_numpy(data)
                data = data.unsqueeze(1)

            indices = [b[1:] for b in batch]
            data = data.to(device)
            output = net(data)
            if isinstance(output, tuple):
                output = output[0]
            output = output.to('cpu')

            if patch_size == 1 or center_pixel:
                output = output.numpy()
            else:
                output = np.transpose(output.numpy(), (0, 2, 3, 1))
            for (x, y, w, h), out in zip(indices, output):
                if center_pixel:
                    probs[x + w // 2, y + h // 2] += out
                else:
                    probs[x:x + w, y:y + h] += out
    return probs
Example #32
    def set_item_candidates(self, n_user, n_item, train_data, eval_data,
                            path_list_dict):
        """Construct the sampling distrbiutions for negative/pseudo-labelled instances for each user
        """
        all_users = tuple(set(train_data[:, 0]))
        self.all_users = all_users

        self.n_item = n_item
        self.all_items = set(range(n_item))
        self.neg_c_dict_user = self._build_freq_dict(
            np.concatenate([train_data[:, 0], eval_data[:, 0]]),
            self.all_users)
        self.neg_c_dict_item = self._build_freq_dict(
            np.concatenate([train_data[:, 1], eval_data[:, 1]]),
            self.all_items)

        item_cands = tuple(self.neg_c_dict_item.keys())
        F = np.array(tuple(
            self.neg_c_dict_item.values()))**self.cfg.plabel.neg_pn
        sort_inds = np.argsort(F)
        item_cands = [item_cands[i] for i in sort_inds]
        F = F[sort_inds]
        F = (F / F.sum()).cumsum()
        self.item_freq = (item_cands, F)

        for u, i in tqdm(train_data[:, 0:2]):
            self.user_seed_dict[u].add(i)

        path = hydra.utils.to_absolute_path(self.cfg.reachable_items_path)
        logger.info("calculating reachable items for users")
        self._setup_dst_dict(path_list_dict)
        item_dist_dict = {}
        src_itr = map(
            lambda iu: (
                all_users[iu],
                tuple(self.user_seed_dict[all_users[iu]]),
                self.dst_dict,
                self.neg_c_dict_item,
                self.cfg.plabel.pl_pn,
            ),
            range(len(all_users)),
        )
        grouped = grouper(self.cfg.plabel.chunk_size,
                          src_itr,
                          squash=set([2, 3]))
        with mp.Pool(self.cfg.plabel.par) as pool:
            for idd in pool.imap_unordered(compute_reachable_items_, grouped):
                item_dist_dict.update(idd)
        self.item_dist_dict = item_dist_dict
Example #33
    def command_service(self, rawCommand):
        """
        Parse raw input and execute specified function with args

        :param rawCommand: csv string from Matlab/Simulink of the form:
                'command, namedArg1, arg1, namedArg2, arg2, ..., namedArgN, argN'
        :return: the command name and the value returned by executing it
        """
        pack = [x.strip() for x in split('[,()]*', rawCommand.strip())]
        raw_cmd = pack[0]
        argDict = {key: literal_eval(value) for key, value in utils.grouper(pack[1:], 2)}
        cmd = self.mapInterface.commands[raw_cmd]
        ret = cmd(**argDict)
        logger.info("Command '{}' run with args {}".format(raw_cmd, argDict))
        return raw_cmd, ret
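
For illustration, grouper(pack[1:], 2) simply pairs each named argument with its value before literal_eval converts the values. A minimal sketch with hypothetical input, assuming the zip_longest-based grouper shown after Example #1:

from ast import literal_eval

pack = ['set_gains', 'gain', '1.5', 'offset', '0']  # hypothetical parsed command
argDict = {key: literal_eval(value) for key, value in grouper(pack[1:], 2)}
# argDict == {'gain': 1.5, 'offset': 0}
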
Example #34
    def train(self,triples, total_triples=None, triples_count = 0, chunksize=1000):
        if not self.vocab or not self.vocab_rel:
            raise RuntimeError("you must first build entity and relation vocabulary before training the model")
        start,next_report = time.time(),[1.0]
        triples_count = [triples_count]
        total_triples = total_triples or int(sum(1 for v in triples))
        jobs = Queue(maxsize=2*self.workers)
        lock = threading.Lock()

        def worker_train():
            work = zeros(self.layer1_size, dtype=REAL)
            detR = zeros((self.layer1_size,self.layer1_size),dtype=REAL)
            # neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)
            while True:
                job = jobs.get()
                if job is None:
                    break
                alpha = max(self.min_alpha, self.alpha * (1 - 1.0 * triples_count[0] / total_triples))
                job_triples = self._get_job_triples(alpha,job,work,detR)
                with lock:
                    triples_count[0] += job_triples
                    elapsed = time.time() - start
                    if elapsed>= next_report[0]:
                        logger.info("PROGRESS: at %.2f%% triplrs, alpha %.05f, %.0f triples/s" %
                            (100.0 * triples_count[0] / total_triples, alpha, triples_count[0] / elapsed if elapsed else 0.0))
                        next_report[0] = elapsed + 1.0

        workers = [threading.Thread(target=worker_train) for _ in xrange(self.workers)]
        for thread in workers:
            thread.daemon = True  # make interrupting the process with ctrl+c easier
            thread.start()

        # convert input strings to Vocab objects (eliding OOV/downsampled words), and start filling the jobs queue
        for job_no, job in enumerate(utils.grouper(self._prepare_triples(triples), chunksize)):
            logger.debug("putting job #%i in the queue, qsize=%i" % (job_no, jobs.qsize()))
            jobs.put(job)
        logger.info("reached the end of input; waiting to finish %i outstanding jobs" % jobs.qsize())
        for _ in xrange(self.workers):
            jobs.put(None)  # give the workers heads up that they can finish -- no more work!

        for thread in workers:
            thread.join()

        elapsed = time.time() - start
        logger.info("training on %i triples took %.1fs, %.0f triples/s" %
            (triples_count[0], elapsed, triples_count[0] / elapsed if elapsed else 0.0))
        self.syn0norm = None
        return triples_count[0]
Example #35
def split_batch_by_box_num(batches, box_batch_size):
    batchIdxs, batch_datas = batches
    newdata = []
    num_gpu = len(
        batch_datas
    )  # each is a Dataset instance, d.data['img'] is a one item list

    num_boxes = [
        batch_datas[i].data['gt'][0]['boxes'].shape[0] for i in xrange(num_gpu)
    ]
    max_num_box = max(num_boxes)
    min_num_box = min(num_boxes)

    split_into_num_batch = int(math.ceil(max_num_box / float(box_batch_size)))

    # the indexes for each inner batch
    # the batch with not enough will fill with 0, the first box
    each_batch_selected_indexes = [
        grouper(range(num_boxes[i]), box_batch_size, fillvalue=0)
        for i in xrange(num_gpu)
    ]

    # still need to handle some batch has not enough batch
    t2 = []
    for b in each_batch_selected_indexes:
        if len(b) < split_into_num_batch:
            need = split_into_num_batch - len(b)
            b = b + [[0 for _ in xrange(box_batch_size)] for _ in xrange(need)]
        t2.append(b)

    for i in xrange(split_into_num_batch):
        this_datas = []
        for j in xrange(num_gpu):
            selected = each_batch_selected_indexes[j][i]
            temp = {
                "imgs": [batch_datas[j].data['imgs'][0]],
                "imgdata": [batch_datas[j].data['imgdata'][0]],
                "resized_image": [batch_datas[j].data['resized_image'][0]],
                'gt': [{
                    "boxes":
                    batch_datas[j].data['gt'][0]['boxes'][selected, :],
                    #"labels": batch_datas[j].data['gt'][0]['labels'][selected],
                }],
            }
            this_datas.append(temp)
        newdata.append(
            (batchIdxs, [Dataset(this_data) for this_data in this_datas]))
    return newdata
Example #36
def build_map(left_edge: Tile, top_edge: Tile) -> list[list[Tile]]:
    rows = [top_edge]

    for row_index, row in enumerate(left_edge[1:], 1):
        rows.append([row])
        prev_row = rows[row_index - 1]

        current: Tile = row

        for above, next_above in grouper(prev_row, 2):
            current = next(i for i in current.connections
                           if i != above and any(j == next_above
                                                 for j in i.connections))
            rows[row_index].append(current)

    return rows
Example #37
def main(reset=True):
    """
    Get a database and table
    Generate rows
    Insert chunks
    """
    db = dataset.connect(DATABASE_URL)
    if reset:
        db[TABLE_NAME].drop()

    table = db[TABLE_NAME]
    rows = generate_rows(*FILES)

    for group in grouper(rows, 1000, None):
        group = ifilter(bool, group)
        table.insert_many(group, types=TYPES)
Example #38
def decodeNetwork(encoding):
    layers = []
    for layer_tuple in utils.grouper(4, encoding):
        filter_widths = [1, 3, 5, 7]
        filter_heights = [1, 3, 5, 7]
        num_filters = [24, 36, 48, 64]
        strides = [1, 2, 3, 1]

        filter_widths_i, filter_heights_i, num_filters_i, strides_i = layer_tuple
        
        filter_width = filter_widths[filter_widths_i]
        filter_height = filter_heights[filter_heights_i]
        num_filter = num_filters[num_filters_i]
        stride = strides[strides_i]

        layers.append(ConvolutionalLayer(kernel_size=[filter_width, filter_height], stride=[stride, stride], num_filters=num_filter))
    return layers
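
As a hypothetical illustration, assuming this project's grouper(4, encoding) yields consecutive 4-tuples, an 8-element encoding decodes into two convolutional layers:

encoding = [0, 1, 2, 3,   # layer 1: kernel 1x3, 48 filters, stride 1
            3, 2, 1, 0]   # layer 2: kernel 7x5, 36 filters, stride 1
layers = decodeNetwork(encoding)
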
Example #39
def read_and_translate(translator: inference.Translator, output_handler: output_handler.OutputHandler,
                       chunk_size: Optional[int], source: Optional[str] = None,
                       reference: Optional[str] = None,
                       dictionary: Optional[dict] = None) -> None:
    """
    Reads from either a file or stdin and translates each line, calling the output_handler with the result.

    :param output_handler: Handler that will write output to a stream.
    :param translator: Translator that will translate each line of input.
    :param chunk_size: The size of the portion to read at a time from the input.
    :param source: Path to file which will be translated line-by-line if included, if none use stdin.
    :param reference: Path to reference file.
    :param dictionary: dictionary to constrain translation.
    """
    source_data = sys.stdin if source is None else data_io.smart_open(source)
    reference_data = None if reference is None else data_io.smart_open(reference)

    batch_size = translator.batch_size
    if chunk_size is None:
        if translator.batch_size == 1:
            # No batching, therefore there is no need to read segments in chunks.
            chunk_size = C.CHUNK_SIZE_NO_BATCHING
        else:
            # Get a constant number of batches per call to Translator.translate.
            chunk_size = C.CHUNK_SIZE_PER_BATCH_SEGMENT * translator.batch_size
    else:
        if chunk_size < translator.batch_size:
            logger.warning("You specified a chunk size (%d) smaller than the batch size (%d). This will lead to "
                           "a degregation of translation speed. Consider choosing a larger chunk size." % (chunk_size,
                                                                                                           batch_size))

    logger.info("Translating...")

    total_time, total_lines = 0.0, 0
    for chunk, reference_chunk in itertools.zip_longest(grouper(source_data, chunk_size), grouper(reference_data, chunk_size)
                                            if reference_data is not None else [None]):
        chunk_time = translate(output_handler, chunk, translator, total_lines, reference_chunk)
        total_lines += len(chunk)
        total_time += chunk_time

    if total_lines != 0:
        logger.info("Processed %d lines in %d batches. Total time: %.4f, sec/sent: %.4f, sent/sec: %.4f",
                    total_lines, ceil(total_lines / batch_size), total_time,
                    total_time / total_lines, total_lines / total_time)
    else:
        logger.info("Processed 0 lines.")
Example #40
  def get(self):
    if not self.is_running():
      self.start()
    try:
      while self.is_running():
        if self.cur_batch_count == self.dataset.num_batches:
          self._stop()
          return

        samples = []
        for i in range(self.dataset.batch_size):
          # first get got the ApplyResult object,
          # then second get to get the actual thing (block till get)
          sample = self.queue.get(block=True).get()
          self.queue.task_done()
          samples.append(sample)

        # break the mini-batch into mini-batches for multi-gpu
        if self.is_multi_gpu:
          # a list of [frames, boxes, labels_arr, ori_boxes, box_keys]
          batches = []

          this_batch_idxs = range(len(samples))

          # pack these batches for each gpu
          this_batch_idxs_gpus = utils.grouper(
              this_batch_idxs, self.dataset.batch_size_per_gpu)
          batches = []
          for this_batch_idxs_per_gpu in this_batch_idxs_gpus:
            batches.append(self.dataset.collect_batch(
                samples, this_batch_idxs_per_gpu))

          batch = batches
        else:
          batch = self.dataset.collect_batch(samples)


        self.cur_batch_count += 1
        yield batch

    except Exception as e:  # pylint: disable=broad-except
      self._stop()
      _type, _value, _traceback = sys.exc_info()
      print("Exception in enqueuer.get: %s" % e)
      traceback.print_tb(_traceback)
      raise Exception
Example #41
def fetch_edges():
    Edges.database = connect("houtx_edges")
    User.database = connect("away_user")
    old_edges = set(int(row['id']) for row in Edges.database.paged_view("_all_docs",endkey="_"))
    uids = set(_users_from_scores())-old_edges
    settings.pdb()
    for g in grouper(100,uids):
        for user in twitter.user_lookup(g):
            if user is None or user.protected: continue
            try:
                edges = twitter.get_edges(user._id)
            except restkit.errors.Unauthorized:
                logging.warn("unauthorized!")
                continue
            except restkit.errors.ResourceNotFound:
                logging.warn("resource not found!?")
                continue
            edges.save()
            user.save()
            sleep_if_needed()
Example #42
def compute_descriptors(infile, descriptor_type):
    """Reads low-level descriptors from DenseTracks."""

    LEN_LINE = 436
    POS_IDXS = [1, 2, 0]  # Positional coordinates (X, Y, T).

    dense_tracks = subprocess.Popen(
        ['./DenseTrack', infile], stdout=subprocess.PIPE)
    descriptor_idxs = DESC_IDXS[descriptor_type]

    for lines in grouper(dense_tracks.stdout, NR_DESCRIPTORS):

        all_descs = np.vstack([
            map(float, line.split())
            for line in lines
            if line is not None]
        ).astype(np.float32)

        assert all_descs.shape[0] <= NR_DESCRIPTORS
        assert all_descs.shape[1] == LEN_LINE

        yield all_descs[:, POS_IDXS], all_descs[:, descriptor_idxs]
Example #43
def main():

	parser = argparse.ArgumentParser()
	parser.add_argument('-num_hidden_units', type=int, default=1024)
	parser.add_argument('-num_hidden_layers', type=int, default=3)
	parser.add_argument('-dropout', type=float, default=0.5)
	parser.add_argument('-activation', type=str, default='tanh')
	parser.add_argument('-language_only', type=bool, default= False)
	parser.add_argument('-num_epochs', type=int, default=100)
	parser.add_argument('-model_save_interval', type=int, default=10)
	parser.add_argument('-batch_size', type=int, default=128)
	parser.add_argument('-word_vector', type=str, default='')
	args = parser.parse_args()

	questions_train = open('../data/preprocessed/questions_train2014.txt', 'r').read().decode('utf8').splitlines()
	answers_train = open('../data/preprocessed/answers_train2014_modal.txt', 'r').read().decode('utf8').splitlines()
	images_train = open('../data/preprocessed/images_train2014.txt', 'r').read().decode('utf8').splitlines()
	vgg_model_path = '../features/coco/vgg_feats.mat'
	maxAnswers = 1000
	questions_train, answers_train, images_train = selectFrequentAnswers(questions_train,answers_train,images_train, maxAnswers)

	#encode the remaining answers
	labelencoder = preprocessing.LabelEncoder()
	labelencoder.fit(answers_train)
	nb_classes = len(list(labelencoder.classes_))
	joblib.dump(labelencoder,'../models/labelencoder.pkl')

	features_struct = scipy.io.loadmat(vgg_model_path)
	VGGfeatures = features_struct['feats']
	print 'loaded vgg features'
	image_ids = open('../features/coco_vgg_IDMap.txt').read().splitlines()
	id_map = {}
	for ids in image_ids:
		id_split = ids.split()
		id_map[id_split[0]] = int(id_split[1])

	# Code to choose the word vectors, default is Goldberg but GLOVE is preferred
	if args.word_vector == 'glove':
		nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
	else:
		nlp = English()

	print 'loaded ' + args.word_vector + ' word2vec features...'
	img_dim = 4096
	word_vec_dim = 300

	model = Sequential()
	if args.language_only:
		model.add(Dense(args.num_hidden_units, input_dim=word_vec_dim, init='uniform'))
	else:
		model.add(Dense(args.num_hidden_units, input_dim=img_dim+word_vec_dim, init='uniform'))
	model.add(Activation(args.activation))
	if args.dropout>0:
		model.add(Dropout(args.dropout))
	for i in xrange(args.num_hidden_layers-1):
		model.add(Dense(args.num_hidden_units, init='uniform'))
		model.add(Activation(args.activation))
		if args.dropout>0:
			model.add(Dropout(args.dropout))
	model.add(Dense(nb_classes, init='uniform'))
	model.add(Activation('softmax'))

	json_string = model.to_json()
	if args.language_only:
		model_file_name = '../models/mlp_language_only_num_hidden_units_' + str(args.num_hidden_units) + '_num_hidden_layers_' + str(args.num_hidden_layers)
	else:
		model_file_name = '../models/mlp_num_hidden_units_' + str(args.num_hidden_units) + '_num_hidden_layers_' + str(args.num_hidden_layers)		
	open(model_file_name  + '.json', 'w').write(json_string)

	print 'Compiling model...'
	model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
	print 'Compilation done...'
	
	print 'Training started...'
	for k in xrange(args.num_epochs):
		#shuffle the data points before going through them
		index_shuf = range(len(questions_train))
		shuffle(index_shuf)
		questions_train = [questions_train[i] for i in index_shuf]
		answers_train = [answers_train[i] for i in index_shuf]
		images_train = [images_train[i] for i in index_shuf]
		progbar = generic_utils.Progbar(len(questions_train))
		for qu_batch,an_batch,im_batch in zip(grouper(questions_train, args.batch_size, fillvalue=questions_train[-1]), 
											grouper(answers_train, args.batch_size, fillvalue=answers_train[-1]), 
											grouper(images_train, args.batch_size, fillvalue=images_train[-1])):
			X_q_batch = get_questions_matrix_sum(qu_batch, nlp)
			if args.language_only:
				X_batch = X_q_batch
			else:
				X_i_batch = get_images_matrix(im_batch, id_map, VGGfeatures)
				X_batch = np.hstack((X_q_batch, X_i_batch))
			Y_batch = get_answers_matrix(an_batch, labelencoder)
			loss = model.train_on_batch(X_batch, Y_batch)
			# fix for the Keras v0.3 issue #9
			progbar.add(args.batch_size, values=[("train loss", loss[0])])
		#print type(loss)
		if k%args.model_save_interval == 0:
			model.save_weights(model_file_name + '_epoch_{:02d}.hdf5'.format(k))

	model.save_weights(model_file_name + '_epoch_{:02d}.hdf5'.format(k))
Example #44
def main():
	parser = argparse.ArgumentParser()
	parser.add_argument('-num_hidden_units_mlp', type=int, default=1024)
	parser.add_argument('-num_hidden_units_lstm', type=int, default=512)
	parser.add_argument('-num_hidden_layers_mlp', type=int, default=3)
	parser.add_argument('-num_hidden_layers_lstm', type=int, default=1)
	parser.add_argument('-dropout', type=float, default=0.5)
	parser.add_argument('-activation_mlp', type=str, default='tanh')
	parser.add_argument('-num_epochs', type=int, default=100)
	parser.add_argument('-model_save_interval', type=int, default=5)
	parser.add_argument('-batch_size', type=int, default=128)
	parser.add_argument('-word_vector', type=str, default='')
	#TODO Feature parser.add_argument('-resume_training', type=str)
	#TODO Feature parser.add_argument('-language_only', type=bool, default= False)
	args = parser.parse_args()

	word_vec_dim= 300
	img_dim = 4096
	max_len = 30
	nb_classes = 1000

	#get the data
	questions_train = open('../data/preprocessed/questions_train2014.txt', 'r').read().decode('utf8').splitlines()
	questions_lengths_train = open('../data/preprocessed/questions_lengths_train2014.txt', 'r').read().decode('utf8').splitlines()
	answers_train = open('../data/preprocessed/answers_train2014_modal.txt', 'r').read().decode('utf8').splitlines()
	images_train = open('../data/preprocessed/images_train2014.txt', 'r').read().decode('utf8').splitlines()
	vgg_model_path = '../features/coco/vgg_feats.mat'

	max_answers = nb_classes
	questions_train, answers_train, images_train = selectFrequentAnswers(questions_train,answers_train,images_train, max_answers)
	questions_lengths_train, questions_train, answers_train, images_train = (list(t) for t in zip(*sorted(zip(questions_lengths_train, questions_train, answers_train, images_train))))

	#encode the remaining answers
	labelencoder = preprocessing.LabelEncoder()
	labelencoder.fit(answers_train)
	nb_classes = len(list(labelencoder.classes_))
	joblib.dump(labelencoder,'../models/labelencoder.pkl')
	
	image_model = Sequential()
	image_model.add(Reshape(input_shape = (img_dim,), dims=(img_dim,)))

	language_model = Sequential()
	if args.num_hidden_layers_lstm == 1:
		language_model.add(LSTM(output_dim = args.num_hidden_units_lstm, return_sequences=False, input_shape=(max_len, word_vec_dim)))
	else:
		language_model.add(LSTM(output_dim = args.num_hidden_units_lstm, return_sequences=True, input_shape=(max_len, word_vec_dim)))
		for i in xrange(args.num_hidden_layers_lstm-2):
			language_model.add(LSTM(output_dim = args.num_hidden_units_lstm, return_sequences=True))
		language_model.add(LSTM(output_dim = args.num_hidden_units_lstm, return_sequences=False))

	model = Sequential()
	model.add(Merge([language_model, image_model], mode='concat', concat_axis=1))
	for i in xrange(args.num_hidden_layers_mlp):
		model.add(Dense(args.num_hidden_units_mlp, init='uniform'))
		model.add(Activation(args.activation_mlp))
		model.add(Dropout(args.dropout))
	model.add(Dense(nb_classes))
	model.add(Activation('softmax'))

	json_string = model.to_json()
	model_file_name = '../models/lstm_1_num_hidden_units_lstm_' + str(args.num_hidden_units_lstm) + \
						'_num_hidden_units_mlp_' + str(args.num_hidden_units_mlp) + '_num_hidden_layers_mlp_' + \
						str(args.num_hidden_layers_mlp) + '_num_hidden_layers_lstm_' + str(args.num_hidden_layers_lstm)
	open(model_file_name + '.json', 'w').write(json_string)

	model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
	print 'Compilation done'

	features_struct = scipy.io.loadmat(vgg_model_path)
	VGGfeatures = features_struct['feats']
	print 'loaded vgg features'
	image_ids = open('../features/coco_vgg_IDMap.txt').read().splitlines()
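	# map each COCO image id to its column in the precomputed VGG feature matrix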
	img_map = {}
	for ids in image_ids:
		id_split = ids.split()
		img_map[id_split[0]] = int(id_split[1])

	# Code to choose the word vectors, default is Goldberg but GloVe is preferred
	if args.word_vector == 'glove':
		nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
	else:
		nlp = English()

	print 'loaded ' + args.word_vector + ' word2vec features...'
	## training
	print 'Training started...'
	for k in xrange(args.num_epochs):

		progbar = generic_utils.Progbar(len(questions_train))
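		# grouper pads the last, partial batch with the given fillvalue so that all
		# three streams stay aligned and zip never drops the trailing examples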

		for qu_batch,an_batch,im_batch in zip(grouper(questions_train, args.batch_size, fillvalue=questions_train[-1]), 
												grouper(answers_train, args.batch_size, fillvalue=answers_train[-1]), 
												grouper(images_train, args.batch_size, fillvalue=images_train[-1])):
			timesteps = len(nlp(qu_batch[-1])) # questions are sorted by length, so the last question in the batch is the longest
			X_q_batch = get_questions_tensor_timeseries(qu_batch, nlp, timesteps)
			X_i_batch = get_images_matrix(im_batch, img_map, VGGfeatures)
			Y_batch = get_answers_matrix(an_batch, labelencoder)
			loss = model.train_on_batch([X_q_batch, X_i_batch], Y_batch)
			# fix for the Keras v0.3 issue #9
			progbar.add(args.batch_size, values=[("train loss", loss[0])])

		
		if k%args.model_save_interval == 0:
			model.save_weights(model_file_name + '_epoch_{:03d}.hdf5'.format(k))

	model.save_weights(model_file_name + '_epoch_{:03d}.hdf5'.format(k))
                diff += 1.0 if cat_id_gold in nearest else 0.0
                print nearest, cat_id_gold
                confusion_mtx.setdefault(cat_id_gold, {})
                confusion_mtx[cat_id_gold].setdefault(nearest[0], 0)
                confusion_mtx[cat_id_gold][nearest[0]] += 1
            qout.put(diff)

    jobs = Queue(maxsize=50)
    qout = Queue(maxsize=20000)
    threads = [Thread(target=worker_infer) for _ in xrange(args.thread)]
    sent_num = 0
    for t in threads:
        t.daemon = True
        t.start()

    for job_no, job in enumerate(utils.grouper(prepare_sentences(), 100)):
        logger.info("putting job #%i in the queue, qsize=%i" % (job_no, jobs.qsize()))
        jobs.put(job)
        sent_num += len(job)
    logger.info("reached the end of input; waiting to finish %i outstanding jobs" % jobs.qsize())

    for _ in xrange(args.thread):
        jobs.put(None)
    for t in threads:
        t.join()

    avg = 0.0
    while not qout.empty():
        val = qout.get()
        avg += val
    avg /= sent_num
def main():
    client = InfluxDBClient(host=args.influxdb_host, ssl=args.ssl, verify_ssl=False, port=8086, database=args.database)
    logger = configure_logging('parse_operations')
    with open(args.input_file, 'r', encoding="latin-1") as f:
        line_count = 0
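        # grouper yields the log file in fixed-size chunks so that points can be written to InfluxDB in batches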
        for chunk in grouper(f, args.batch_size):
            json_points = []
            for line in chunk:
                # zip_longest will backfill any missing values with None, so we need to handle this, otherwise we'll miss the last batch
                line_count += 1
                if line and line.strip().endswith("ms"):
                    values = {}
                    tags = {
                        'project': args.project,
                        'hostname': args.hostname,
                    }
                    try:
                        tags['operation'] = line.split("] ", 1)[1].split()[0]
                    except IndexError as e:
                        logger.error("Unable to get operation type - {} - {}".format(e, line))
                        break
                    if tags['operation'] in ['command', 'query', 'getmore', 'insert', 'update', 'remove', 'aggregate', 'mapreduce']:
                        thread = line.split("[", 1)[1].split("]")[0]
                        # Alternately - print(split_line[3])
                        if tags['operation'] == 'command':
                            tags['command'] = line.split("command: ")[1].split()[0]
                        if "conn" in thread:
                            tags['connection_id'] = thread
                        split_line = line.split()
                        values['duration_in_milliseconds'] = int(split_line[-1].rstrip('ms'))
                        # TODO 2.4.x timestamps have spaces
                        timestamp = parse(split_line[0])
                        if split_line[1].startswith("["):
                            # TODO - Parse locks from 2.6 style loglines
                            # 2.4 Logline:
                            tags['namespace'] = split_line[3]
                            for stat in reversed(split_line):
                                if "ms" in stat:
                                    pass
                                elif ":" in stat:
                                    key, value = stat.split(":", 1)
                                    values[key] = int(value)
                                elif stat == "locks(micros)":
                                    pass
                                else:
                                    break
                        else:
                            # 3.x logline:
                            tags['namespace'] = split_line[5]
                            # TODO - Should we be splitting on "locks:{" instead?
                            pre_locks, locks = line.rsplit("locks:", 1)
                            # Strip duration from locks
                            locks = locks.rsplit(" ", 1)[0]
                            # Add quotation marks around string, so that it is valid JSON
                            locks = re.sub(r"(\w+):", r'"\g<1>":', locks)
                            locks_document = flatdict.FlatDict(json.loads(locks), delimiter="_")
                            for key, value in locks_document.iteritems():
                                values["locks_{}".format(key)] = int(value)



                            # We work backwards from the end, until we run out of key:value pairs
                            # TODO - Can we assume these are always integers?
                            for stat in reversed(pre_locks.split()):
                                if ":" in stat:
                                    key, value = stat.split(":", 1)
                                    values[key] = int(value)
                                else:
                                    break
                            # TODO - Parse the full query plan for IXSCAN
                            if 'planSummary: ' in line:
                                tags['plan_summary'] = (line.split('planSummary: ', 1)[1].split()[0])
                        json_points.append(create_point(timestamp, "operations", values, tags))
                    else:
                        logger.info("'{}' is not a recognised operation type - not parsing this line ({})".format(tags['operation'], line))
            if json_points:
                # TODO - We shouldn't need to wrap this in try/except - should be handled by retry decorator
                try:
                    # TODO - Have a dry-run mode
                    write_points(logger, client, json_points, line_count)
                    pass
                except Exception as e:
                    logger.error("Retries exceeded. Giving up on this point.")
Exemple #47
0
try:
    with BZ2File(args.dumpfile, 'r') as f:
        parser = parse_wiki.articles(f)

        skip = args.skip
        for i in range(skip):
            parser.next()

        time_preproc = 0
        time_iserv = 0
        last_time = time()
        articles_count = 0
        this_round_count = 0
        processed_articles = skip
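        # process the dump in rounds of args.round articles; the inner loop breaks
        # on the falsy padding entries grouper adds to the final, partial round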

        for docgroup in grouper(args.round, parser):

            t1 = time()

            bdata = index_pb.BuilderData()
            round_tokens = set()
            processed = 0

            for doc in docgroup:
                if not doc: break

                (title, ns, sha1, text) = doc

                if ns != '0': continue
                if not text: continue # skip articles with empty text (shouldn't normally happen)
                if text[:9].lower() == ('#redirect'): continue
Exemple #48
0
#!/usr/bin/env python
import sys
import utils

if __name__ == "__main__":
    host = sys.argv[1]
    port = int(sys.argv[2])
    collection = sys.argv[3]
    chunk_size = 10000
    added = 0
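    # read stdin in chunks of chunk_size lines and bulk-index each chunk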
    for lines in utils.grouper(sys.stdin, chunk_size):
        lines = [x for x in lines if x is not None]
        objects = [utils.parse_line(line) for line in lines]
        utils.index_objects(objects, host, port, collection)
        added += chunk_size
        print >>sys.stderr, added
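
# For reference, a minimal sketch of the itertools-style grouper helper these snippets
# assume. The argument order here is an assumption and varies between the utils modules
# used above (some take (iterable, n, fillvalue), others (n, iterable)); adjust as needed.
try:
    from itertools import izip_longest as zip_longest  # Python 2
except ImportError:
    from itertools import zip_longest  # Python 3

def grouper(iterable, n, fillvalue=None):
    """Collect items into fixed-length chunks; the last chunk is padded with fillvalue."""
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)

# e.g. list(grouper('abcdefgh', 3, '-')) -> [('a','b','c'), ('d','e','f'), ('g','h','-')]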
def main():
	parser = argparse.ArgumentParser()
	parser.add_argument('-num_hidden_units', type=int, default=512)
	parser.add_argument('-num_lstm_layers', type=int, default=2)
	parser.add_argument('-dropout', type=float, default=0.2)
	parser.add_argument('-activation', type=str, default='tanh')
	parser.add_argument('-num_epochs', type=int, default=100)
	parser.add_argument('-model_save_interval', type=int, default=5)
	parser.add_argument('-batch_size', type=int, default=128)
	parser.add_argument('-word_vector', type=str, default='')
	args = parser.parse_args()

	questions_train = open('../data/preprocessed/questions_train2014.txt', 'r').read().decode('utf8').splitlines()
	questions_lengths_train = open('../data/preprocessed/questions_lengths_train2014.txt', 'r').read().decode('utf8').splitlines()
	answers_train = open('../data/preprocessed/answers_train2014.txt', 'r').read().decode('utf8').splitlines()
	images_train = open('../data/preprocessed/images_train2014.txt', 'r').read().decode('utf8').splitlines()
	max_answers = 1000
	questions_train, answers_train, images_train = selectFrequentAnswers(questions_train,answers_train,images_train, max_answers)

	print 'Loaded questions, sorting by length...'
	questions_lengths_train, questions_train, answers_train = (list(t) for t in zip(*sorted(zip(questions_lengths_train, questions_train, answers_train))))
	
	#encode the remaining answers
	labelencoder = preprocessing.LabelEncoder()
	labelencoder.fit(answers_train)
	nb_classes = len(list(labelencoder.classes_))
	joblib.dump(labelencoder,'../models/labelencoder.pkl')
	max_len = 30 #25 is max for training, 27 is max for validation
	word_vec_dim = 300

	model = Sequential()
	model.add(LSTM(output_dim = args.num_hidden_units, activation='tanh', 
			return_sequences=True, input_shape=(max_len, word_vec_dim)))
	model.add(Dropout(args.dropout))
	model.add(LSTM(args.num_hidden_units, return_sequences=False))
	model.add(Dense(nb_classes, init='uniform'))
	model.add(Activation('softmax'))

	json_string = model.to_json()
	model_file_name = '../models/lstm_language_only_num_hidden_units_' + str(args.num_hidden_units) + '_num_lstm_layers_' + str(args.num_lstm_layers) + '_dropout_' + str(args.dropout)
	open(model_file_name  + '.json', 'w').write(json_string)
	
	print 'Compiling model...'
	model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
	print 'Compilation done...'

	#set up word vectors
	# Code to choose the word vectors, default is Goldberg but GloVe is preferred
	if args.word_vector == 'glove':
		nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
	else:
		nlp = English()

	print 'loaded ' + args.word_vector + ' word2vec features...'

	## training
	# A few variables (num_epochs, batch_size, model_save_interval) were moved into the argument parser
	print 'Training started...'
	for k in xrange(args.num_epochs):

		progbar = generic_utils.Progbar(len(questions_train))

		for qu_batch,an_batch,im_batch in zip(grouper(questions_train, args.batch_size, fillvalue=questions_train[0]), 
												grouper(answers_train, args.batch_size, fillvalue=answers_train[0]), 
												grouper(images_train, args.batch_size, fillvalue=images_train[0])):
			timesteps = len(nlp(qu_batch[-1])) # questions are sorted by length, so the last question in the batch is the longest
			X_q_batch = get_questions_tensor_timeseries(qu_batch, nlp, timesteps)
			Y_batch = get_answers_matrix(an_batch, labelencoder)
			loss = model.train_on_batch(X_q_batch, Y_batch)
			# fix for the Keras v0.3 issue #9
			progbar.add(args.batch_size, values=[("train loss", loss[0])])

		
		if k%args.model_save_interval == 0:
			model.save_weights(model_file_name + '_epoch_{:02d}.hdf5'.format(k))

	model.save_weights(model_file_name + '_epoch_{:02d}.hdf5'.format(k+1))
Exemple #50
0
    learnset_url += "{0}_(Pok%C3%A9mon)/Generation_I_learnset".format(pname)

    html_file = urllib2.urlopen(learnset_url)
    learnset_html = html_file.read()
    html_file.close()

    bs = BeautifulSoup(learnset_html)
    x = [td.text for td in bs.findAll("td")
         if 0 < len(td.text) < 60]
    # if td.text in movename_to_num.values()] worked pretty well, but...
    # Just grabbing everything that appears anywhere and is a valid move
    # name will grab Psychic, when those characters only appeared to indicate
    # the type of a move and not the Move Psychic

    # So instead, group them into clumps... it seems to group very consistently
    grouped = list(grouper(x, 6))

    # Pikachu had a weird move: Light Screen, which he learns at Level 50 in
    # Pokemon Yellow, but never in Red/Blue. So just grabbing the values in the
    # table that are valid moves would actually lead us to believe that Pikachu
    # can learn Light Screen, which he can, but it doesn't have a TM until Gen
    # 3. Instead, let's group the entries, drop the ones from Pokemon Yellow,
    # and then grab the remaining moves
    not_yellow = [entry for entry in grouped if not entry[0].endswith("Y")]

    # fix a problem that Vaporeon == 106 was having
    valid_starts = ("T", "H", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9")
    valid = [entry for entry in not_yellow
             if entry[0].startswith(valid_starts)]

    moves = [standardize(entry[1]) for entry in valid
Exemple #51
0
    def train(self, sentences, total_words=None, word_count=0, sent_count=0, chunksize=100):
        """
        Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).
        Each sentence must be a list of unicode strings.

        """
        logger.info("training model with %i workers on %i sentences and %i features, "
                    "using 'skipgram'=%s 'hierarchical softmax'=%s 'subsample'=%s and 'negative sampling'=%s" %
                    (self.workers, self.sents_len, self.layer1_size, self.sg, self.hs, self.sample, self.negative))

        if not self.vocab:
            raise RuntimeError("you must first build vocabulary before training the model")

        start, next_report = time.time(), [1.0]
        word_count = [word_count]
        sent_count = [sent_count]
        total_words = total_words or sum(v.count * v.sample_probability for v in itervalues(self.vocab))
        total_sents = self.total_sents #it's now different from self.sents_len
        jobs = Queue(maxsize=2 * self.workers)  # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :(
        lock = threading.Lock()  # for shared state (=number of words trained so far, log reports...)

        def worker_train():
            """Train the model, lifting lists of sentences from the jobs queue."""
            work = matutils.zeros_aligned(self.layer1_size + 8, dtype=REAL)  # each thread must have its own work memory
            neu1 = matutils.zeros_aligned(self.layer1_size + 8, dtype=REAL)

            while True:
                job = jobs.get()
                if job is None:  # data finished, exit
                    break
                # update the learning rate before every job
                if self.update_mode == 0:
                    alpha = max(self.min_alpha, self.alpha * (1 - 1.0 * word_count[0] / total_words))
                else:
                    alpha = self.alpha
                job_words = sum(train_sent_vec(self, self.sents[sent_no], sentence, alpha, work, neu1, self.sents_grad[sent_no])
                                for sent_no, sentence in job)
                with lock:
                    word_count[0] += job_words
                    sent_count[0] += chunksize
                    elapsed = time.time() - start
                    if elapsed >= next_report[0]:
                        logger.info("PROGRESS: at %.2f%% sents, alpha %.05f, %.0f words/s" %
                                    (100.0 * sent_count[0] / total_sents, alpha, word_count[0] / elapsed if elapsed else 0.0))
                        next_report[0] = elapsed + 1.0  # don't flood the log, wait at least a second between progress reports

        workers = [threading.Thread(target=worker_train) for _ in xrange(self.workers)]
        for thread in workers:
            thread.daemon = True  # make interrupting the process with ctrl+c easier
            thread.start()

        def prepare_sentences():
            for sent_tuple in sentences:
                sentence = sent_tuple[0]
                sent_id  = sent_tuple[1]
                sent_no = self.sent_no_hash[sent_id]
                sampled = [self.vocab.get(word, None) for word in sentence
                           if word in self.vocab and (self.vocab[word].sample_probability >= 1.0 or self.vocab[word].sample_probability >= random.random_sample())]
                yield (sent_no, sampled)

        # convert input strings to Vocab objects (eliding OOV/downsampled words), and start filling the jobs queue
        for job_no, job in enumerate(utils.grouper(prepare_sentences(), chunksize)):
            logger.debug("putting job #%i in the queue, qsize=%i" % (job_no, jobs.qsize()))
            jobs.put(job)
        logger.info("reached the end of input; waiting to finish %i outstanding jobs" % jobs.qsize())
        for _ in xrange(self.workers):
            jobs.put(None)  # give the workers heads up that they can finish -- no more work!

        for thread in workers:
            thread.join()

        elapsed = time.time() - start
        logger.info("training on %i words took %.1fs, %.0f words/s" %
                    (word_count[0], elapsed, word_count[0] / elapsed if elapsed else 0.0))

        return word_count[0]
Exemple #52
0
from utils import grouper

NUM_POKEMON = 151

# Download the html
url = r"http://bulbapedia.bulbagarden.net/"
url += "wiki/List_of_Pokémon_by_index_number_(Generation_I)"
html_file = urllib2.urlopen(url)
html = html_file.read()
html_file.close()

# Parse with BeautifulSoup, grab the types
bs = BeautifulSoup(html)
x = [td.text for td in bs.findAll("td") if 0 < len(td.text) < 60]
pokemon_types = dict()
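# each table row flattens into 5 cells: hex index, internal number, name, type 1, type 2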
for data in grouper(x, 5):
    hexx, weirdno, name, type1, type2 = data
    if "Trainer" in name:
        break  # just eyeballed the data to find this break point
    if "Missingno" in name:
        continue  # lots of glitch Pokemon, just skip them
    pokemon_types[name] = (type1, type2)
assert len(pokemon_types) == NUM_POKEMON


# Merge this with existing Pokemon data
basestats = dict()
basestats_file = open("base_stats.csv")
basestats_file.readline()  # skip the header
for line in basestats_file:
    number, name, hp, attack, defense, speed, special = \
Exemple #53
0
for i in range(50):
    actual *= raiz_doceava_de_dos
    frecuencias.append(actual)

TONO = 2
SEMITONO = 1

escala = [TONO, TONO, SEMITONO, TONO, TONO, TONO, SEMITONO]
# Extend the scale so we can reach a few more notes
escala *= 2

notas = [frecuencias[0]]

# Stepping through the scale two intervals at a time produces a third
actual = 0
for grupo in grouper(escala, 2, 0):
    actual += sum(grupo)
    notas.append(frecuencias[actual])
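# notas now holds the root frequency plus each note reached by stepping up the scale a third at a time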


muestras_por_segundo = 44100
duracion = 0.5

muestras_totales = duracion * muestras_por_segundo

muestras = []

for frecuencia in notas:
    ciclos_por_muestra = frecuencia / muestras_por_segundo
    incremento = 2 * math.pi * ciclos_por_muestra
    fase = 0
Exemple #54
0
def main():

	parser = argparse.ArgumentParser()
	parser.add_argument('-model', type=str, required=True)
	parser.add_argument('-weights', type=str, required=True)
	parser.add_argument('-results', type=str, required=True)
	parser.add_argument('-word_vector', type=str, default='')
	args = parser.parse_args()

	model = model_from_json(open(args.model).read())
	model.load_weights(args.weights)
	model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

	questions_val = open('../data/preprocessed/questions_val2014.txt', 
						'r').read().decode('utf8').splitlines()
	questions_lengths_val = open('../data/preprocessed/questions_lengths_val2014.txt', 
								'r').read().decode('utf8').splitlines()
	answers_val = open('../data/preprocessed/answers_val2014_all.txt', 
						'r').read().decode('utf8').splitlines()
	images_val = open('../data/preprocessed/images_val2014.txt', 
						'r').read().decode('utf8').splitlines()
	vgg_model_path = '../features/coco/vgg_feats.mat'
	
	questions_lengths_val, questions_val, answers_val, images_val = (list(t) for t in zip(*sorted(zip(questions_lengths_val, questions_val, answers_val, images_val))))

	print 'Model compiled, weights loaded'
	labelencoder = joblib.load('../models/labelencoder.pkl')

	features_struct = scipy.io.loadmat(vgg_model_path)
	VGGfeatures = features_struct['feats']
	print 'Loaded vgg features'
	image_ids = open('../features/coco_vgg_IDMap.txt').read().splitlines()
	img_map = {}
	for ids in image_ids:
		id_split = ids.split()
		img_map[id_split[0]] = int(id_split[1])

	if args.word_vector == 'glove':
		nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
	else:
		nlp = English()

	print 'loaded ' + args.word_vector + ' word2vec features...'


	nb_classes = 1000
	y_predict_text = []
	batchSize = 128
	widgets = ['Evaluating ', Percentage(), ' ', Bar(marker='#',left='[',right=']'),
           ' ', ETA()]
	pbar = ProgressBar(widgets=widgets)

	for qu_batch,an_batch,im_batch in pbar(zip(grouper(questions_val, batchSize, fillvalue=questions_val[0]), 
												grouper(answers_val, batchSize, fillvalue=answers_val[0]), 
												grouper(images_val, batchSize, fillvalue=images_val[0]))):
		timesteps = len(nlp(qu_batch[-1])) # questions are sorted by length, so the last question in the batch is the longest
		X_q_batch = get_questions_tensor_timeseries(qu_batch, nlp, timesteps)
		if 'language_only' in args.model:
			X_batch = X_q_batch
		else:
			X_i_batch = get_images_matrix(im_batch, img_map, VGGfeatures)
			X_batch = [X_q_batch, X_i_batch]
		y_predict = model.predict_classes(X_batch, verbose=0)
		y_predict_text.extend(labelencoder.inverse_transform(y_predict))

	total = 0
	correct_val=0.0
	f1 = open(args.results, 'w')
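	# VQA-style accuracy: full credit if at least 3 ground-truth annotators gave this answer, otherwise partial credit of matches/3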
	for prediction, truth, question, image in zip(y_predict_text, answers_val, questions_val, images_val):
		temp_count=0
		for _truth in truth.split(';'):
			if prediction == _truth:
				temp_count+=1

		if temp_count>2:
			correct_val+=1
		else:
			correct_val+=float(temp_count)/3

		total+=1

		f1.write(question.encode('utf-8'))
		f1.write('\n')
		f1.write(image.encode('utf-8'))
		f1.write('\n')
		f1.write(prediction)
		f1.write('\n')
		f1.write(truth.encode('utf-8'))
		f1.write('\n')
		f1.write('\n')

	f1.write('Final Accuracy is ' + str(correct_val/total))
	f1.close()
	f1 = open('../results/overall_results.txt', 'a')
	f1.write(args.weights + '\n')
	f1.write(str(correct_val/total) + '\n\n')
	f1.close()
	print 'Final Accuracy on the validation set is', correct_val/total
Exemple #55
0
def main():
	parser = argparse.ArgumentParser()
	parser.add_argument('-model', type=str, required=True)
	parser.add_argument('-weights', type=str, required=True)
	parser.add_argument('-results', type=str, required=True)
	args = parser.parse_args()

	model = model_from_json(open(args.model).read())
	model.load_weights(args.weights)
	model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

	questions_val = open('../data/preprocessed/questions_val2014.txt', 
						'r').read().decode('utf8').splitlines()
	answers_val = open('../data/preprocessed/answers_val2014.txt', 
						'r').read().decode('utf8').splitlines()
	images_val = open('../data/preprocessed/images_val2014.txt', 
						'r').read().decode('utf8').splitlines()
	vgg_model_path = '../features/coco/vgg_feats.mat'
	
	print 'Model compiled, weights loaded...'
	labelencoder = joblib.load('../models/labelencoder.pkl')

	features_struct = scipy.io.loadmat(vgg_model_path)
	VGGfeatures = features_struct['feats']
	print 'loaded vgg features'
	image_ids = open('../features/coco_vgg_IDMap.txt').read().splitlines()
	img_map = {}
	for ids in image_ids:
		id_split = ids.split()
		img_map[id_split[0]] = int(id_split[1])

	nlp = English()
	print 'loaded word2vec features'

	nb_classes = 1000
	y_predict_text = []
	batchSize = 128
	widgets = ['Evaluating ', Percentage(), ' ', Bar(marker='#',left='[',right=']'),
           ' ', ETA()]
	pbar = ProgressBar(widgets=widgets)

	for qu_batch,an_batch,im_batch in pbar(zip(grouper(questions_val, batchSize, fillvalue=questions_val[0]), 
												grouper(answers_val, batchSize, fillvalue=answers_val[0]), 
												grouper(images_val, batchSize, fillvalue=images_val[0]))):
		X_q_batch = get_questions_matrix_sum(qu_batch, nlp)
		if 'language_only' in args.model:
			X_batch = X_q_batch
		else:
			X_i_batch = get_images_matrix(im_batch, img_map , VGGfeatures)
			X_batch = np.hstack((X_q_batch, X_i_batch))
		y_predict = model.predict_classes(X_batch, verbose=0)
		y_predict_text.extend(labelencoder.inverse_transform(y_predict))

	correct_val=0
	incorrect_val=0	
	f1 = open(args.results, 'w')

	for prediction, truth, question, image in zip(y_predict_text, answers_val, questions_val, images_val):
		temp_count=0
		for _truth in truth.split(';'):
			if prediction == _truth:
				temp_count+=1

		if temp_count>2:
			correct_val+=1
		else:
			incorrect_val+=1

		f1.write(question.encode('utf-8'))
		f1.write('\n')
		f1.write(image.encode('utf-8'))
		f1.write('\n')
		f1.write(prediction)
		f1.write('\n')
		f1.write(truth.encode('utf-8'))
		f1.write('\n')
		f1.write('\n')

	f1.write('Final Accuracy is ' + str(float(correct_val)/(incorrect_val+correct_val)))
	f1.close()
	f1 = open('../results/overall_results.txt', 'a')
	f1.write(args.weights + '\n')
	f1.write(str(float(correct_val)/(incorrect_val+correct_val)) + '\n')
	f1.close()
	print 'Final Accuracy on the validation set is', float(correct_val)/(incorrect_val+correct_val)
def main():
    client = InfluxDBClient(host=args.influxdb_host, ssl=args.ssl, verify_ssl=False, port=8086, database=args.database)
    logger = configure_logging('parse_iostat')
    iostat_timezone = timezone(args.timezone)
    with open(args.input_file, 'r') as f:
        if args.hostname:
            hostname = args.hostname
            f.__next__() # Skip the "Linux..." line
        else:
            hostname = re.split(r'[()]', f.readline())[1]
        logger.info("Found hostname {}".format(hostname))
        f.__next__() # Skip the blank line
        line_counter = 2
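        # group the parsed iostat blocks so that the resulting points are written to InfluxDB in batches of batch_size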
        for chunk_index, chunk in enumerate(grouper(parse_iostat(f), args.batch_size)):
            json_points = []
            for block in chunk:
                if block:
                    try:
                        for i, line in enumerate(block):
                            line_counter += 1
                            if i == 0:
                                timestamp = iostat_timezone.localize(line)
                                # print(timestamp)
                                # import ipdb;ipdb.set_trace()
                                # print("timestamp is {}".format(timestamp))
                                # TODO: Timezone?
                                # TODO: Better way of storing timestamp
                            elif i == 1: # CPU Metric Headings
                                pass
                            elif i==2:
                                system_stats = dict(zip(system_stat_headers, line.split()))
                                values = {}
                                for metric_name, value in system_stats.items():
                                    values[metric_name] = float(value)
                                json_points.append({
                                    "measurement": "iostat",
                                    "tags": {
                                        "project": args.project,
                                        "hostname": hostname
                                    },
                                    "time": timestamp.isoformat(),
                                    "fields": values
                                })
                            elif i==4: # Disk metric headings
                                pass
                            elif i >= 5 and line:
                                disk_stats = {}
                                device = line.split()[0]
                                disk_stats[device] = dict(zip(disk_stat_headers, line.split()[1:]))

                                for disk_name, metrics in disk_stats.items():
                                    values = {}
                                    for metric_name, value in metrics.items():
                                        # Nasty hack to deal with bad data from Morgan Stanley
                                        # if disk_name not in ['sda', 'sdb', 'dm-0', 'dm-1', 'dm-2']:
                                        #     print(block)
                                        #     raise ValueError
                                        values[metric_name] = float(value)
                                    json_points.append({
                                        "measurement": "iostat",
                                        "tags": {
                                            "project": args.project,
                                            "hostname": hostname,
                                            "device": disk_name,
                                        },
                                        "time": timestamp.isoformat(),
                                        "fields": values
                                    })

                    except ValueError as e:
                        print("Bad output seen - skipping")
                        print(e)
                        print(block)
            write_points(logger, client, json_points, line_counter)