Example #1
    def run_evaluate(self, sess, test, tags):
        """
        Evaluates performance on test set
        Args:
            sess: tensorflow session
            test: dataset that yields tuple of sentences, tags
            tags: {tag: index} dictionary
        Returns:
            accuracy
            f1 score
        """
        accs = []
        correct_preds, total_correct, total_preds = 0., 0., 0.
        for words, labels in minibatches(test, self.config.batch_size):
            labels_pred, sequence_lengths = self.predict_batch(sess, words)

            for lab, lab_pred, length in zip(labels, labels_pred,
                                             sequence_lengths):
                lab = lab[:length]
                lab_pred = lab_pred[:length]
                accs += [a == b for (a, b) in zip(lab, lab_pred)]
                lab_chunks = set(get_chunks(lab, tags, self.config.DEFAULT))
                lab_pred_chunks = set(
                    get_chunks(lab_pred, tags, self.config.DEFAULT))
                correct_preds += len(lab_chunks & lab_pred_chunks)
                total_preds += len(lab_pred_chunks)
                total_correct += len(lab_chunks)

        p = correct_preds / total_preds if correct_preds > 0 else 0
        r = correct_preds / total_correct if correct_preds > 0 else 0
        f1 = 2 * p * r / (p + r) if correct_preds > 0 else 0
        acc = np.mean(accs)
        return acc, f1
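Most of the NER examples on this page lean on a get_chunks helper that turns a sequence of tag ids into (type, start, end) entity spans, which are then compared as sets to compute precision, recall, and F1. A minimal sketch of such a helper, assuming IOB-style tags ('B-PER', 'I-PER', ...) and a default 'O' tag; the exact signature varies across the projects above:

# Illustrative sketch only -- each project ships its own version.
def get_chunks(seq, tags, default_tag='O'):
    """Extract (chunk_type, start, end) spans from a sequence of tag ids."""
    default = tags[default_tag]
    idx_to_tag = {idx: tag for tag, idx in tags.items()}
    chunks = []
    chunk_type, chunk_start = None, None
    for i, tok in enumerate(seq):
        if tok == default and chunk_type is not None:
            # an open chunk ends at the first 'O' tag
            chunks.append((chunk_type, chunk_start, i))
            chunk_type, chunk_start = None, None
        elif tok != default:
            tag = idx_to_tag[tok]
            tok_class, tok_type = tag.split('-')[0], tag.split('-')[-1]
            if chunk_type is None:
                chunk_type, chunk_start = tok_type, i
            elif tok_type != chunk_type or tok_class == 'B':
                # a new chunk starts right after the previous one
                chunks.append((chunk_type, chunk_start, i))
                chunk_type, chunk_start = tok_type, i
    if chunk_type is not None:
        chunks.append((chunk_type, chunk_start, len(seq)))
    return chunks

With tags = {'O': 0, 'B-PER': 1, 'I-PER': 2}, get_chunks([1, 2, 0], tags) gives [('PER', 0, 2)].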
Example #2
    def evaluate(self, test):
        """
        evaluates performance on test set
        :param test: dataset that yields tuple of (sentences, tags)
        :return: metrics: (dict) metrics['acc'] = 98.4, ...
        """
        accs = []
        correct_preds, total_correct, total_preds = 0., 0., 0.

        test_data, sequence_lengths = test.get_batch(test.size())
        test_words = [instance[0] for instance in test_data]
        test_labels = [instance[1] for instance in test_data]
        pred_labels, pred_scores = self.predict(test_words, sequence_lengths)

        for lab, lab_pred, length in zip(test_labels, pred_labels,
                                         sequence_lengths):
            lab = lab[:length]
            lab_pred = lab_pred[:length]
            accs += [a == b for (a, b) in zip(lab, lab_pred)]

            lab_chunks = set(utils.get_chunks(lab, self.config.vocab_labels))
            lab_pred_chunks = set(
                utils.get_chunks(lab_pred, self.config.vocab_labels))

            correct_preds += len(lab_chunks & lab_pred_chunks)
            total_preds += len(lab_pred_chunks)
            total_correct += len(lab_chunks)

        p = correct_preds / total_preds if total_preds > 0 else 0
        r = correct_preds / total_correct if total_correct > 0 else 0
        f1 = 2 * p * r / (p + r) if correct_preds > 0 else 0
        acc = np.mean(accs)
        score = np.mean(pred_scores)
        return {'score': score, 'acc': acc, 'f1': f1, 'p': p, 'r': r}
Example #3
  def run_evaluate(self, sess, test, tags):
    """
    Evaluates performance on test set
    Args:
      sess: tensorflow session
      test: dataset that yields tuple of sentences, tags
      tags: {tag: index} dictionary
    Returns:
      accuracy
      f1 score
    """
    accs = []
    correct_preds, total_correct, total_preds = 0., 0., 0.
    output_file = codecs.open("output", 'w', 'UTF-8')
    idx_to_tag = {idx: tag for tag, idx in tags.items()}
    for words, labels, iob_gold, mention_type_gold, mentions_gold, word_features, char_features in minibatches(test, self.config.batch_size):
      iob_labels_pred, sequence_lengths = self.predict_iob_batch(sess, words, word_features, char_features)
      mentions = []
      mention_sizes = []
      count = 0
      # use len(sequence_lengths) so a final minibatch smaller than
      # batch_size is handled correctly
      for i in range(len(sequence_lengths)):
        length = sequence_lengths[i]
        mention = find_mentions(iob_labels_pred[i][:length])
        mentions.append(mention)
        mention_sizes.append(len(mention))
        if len(mention) == 0:
          count += 1
      if count != len(sequence_lengths):
        mentions_pred, _ = self.predict_type_batch(sess, words, word_features, char_features, mentions)
      else:
        mentions_pred = [[]] * len(sequence_lengths)
   
      for lab, iob_pred, length, mention, mention_pred, mention_size in zip(labels, iob_labels_pred, sequence_lengths, mentions, mentions_pred, mention_sizes):
        lab = lab[:length]
        iob_pred = iob_pred[:length]
        mention_pred = mention_pred[:mention_size]
        
        lab_pred = find_labels(iob_pred, mention_pred, tags, self.id2type)
        accs += [a==b for (a, b) in zip(lab, lab_pred)]
        lab_chunks = set(get_chunks(lab, tags))
        lab_pred_chunks = set(get_chunks(lab_pred, tags))
        correct_preds += len(lab_chunks & lab_pred_chunks)
        total_preds += len(lab_pred_chunks)
        total_correct += len(lab_chunks)
        
        output_string = ""
        for b, c in zip(lab, lab_pred):
          split_line = []
          split_line.append(idx_to_tag[b])
          split_line.append(idx_to_tag[c])
          output_string += ' '.join(split_line) + '\n'
        output_file.write(output_string+'\n')

    p = correct_preds / total_preds if correct_preds > 0 else 0
    r = correct_preds / total_correct if correct_preds > 0 else 0
    f1 = 2 * p * r / (p + r) if correct_preds > 0 else 0
    acc = np.mean(accs)
    return acc, f1
Example #4
def Eval(sess, tagger, data, num_steps, best_eval_metric, name):
    """Evaluates a network and checkpoints it to disk.

  Args:
    sess: tensorflow session to use
    parser: graph builder containing all ops references
    num_steps: number of training steps taken, for logging
    best_eval_metric: current best eval metric, to decide whether this model is
        the best so far

  Returns:
    new best eval metric
  """
    logging.info('Evaluating training network.')
    t = time.time()
    num_epochs = None
    epochs = 0
    logging.info(data.get_sent_num(name))
    epochs, sent_batch = utils.loadBatch(FLAGS.batch_size, epochs, data, name)
    number_of_words = 0
    while True:
        sent_batch, epochs, feature_endpoints, gold_tags, words = utils.get_current_features(
            sent_batch, epochs, data, name)
        predictions, tf_eval_metrics = sess.run(
            [tagger.evaluation['predictions'], tagger.evaluation['logits']],
            feed_dict={tagger.test_input: feature_endpoints})
        utils.set_current_tags(sent_batch, predictions)
        if num_epochs is None:
            num_epochs = epochs
        elif num_epochs < sent_batch[0].get_epoch():
            break
    t_end = time.time()
    data.reset_index(name)
    for sent in sent_batch:
        sent.reset_state()
    accs = []
    correct_preds, total_correct, total_preds = 0., 0., 0.

    while data.has_next_sent(name):
        sent = data.get_next_sent(name)
        words = sent.get_word_list()
        number_of_words += len(words)
        gold_labels = sent.ner_ids
        accs += [a == b for (a, b) in zip(gold_labels, sent.output_tags)]
        lab_chunks = set(utils.get_chunks(gold_labels, data.id2tag))
        lab_pred_chunks = set(utils.get_chunks(sent.output_tags, data.id2tag))
        correct_preds += len(lab_chunks & lab_pred_chunks)
        total_preds += len(lab_pred_chunks)
        total_correct += len(lab_chunks)
    test_time = t_end - t
    p = correct_preds / total_preds if correct_preds > 0 else 0
    r = correct_preds / total_correct if correct_preds > 0 else 0
    f1 = 2 * p * r / (p + r) if correct_preds > 0 else 0
    logging.info("f1 score:")
    logging.info(f1)
    logging.info(number_of_words)
    data.reset_index(name)
    return test_time
Example #5
def run_qa():
    client = MongoClient(config.MONGO_IP, config.MONGO_PORT)
    db = client[config.DB]
    wikipedia = db[config.WIKIPEDIA_COLLECTION]
    wikidocs = list(
        wikipedia.find({}, {
            'wikidata_id': 1,
            '_id': 0
        }).sort('wikidata_id'))
    chunks = get_chunks(wikidocs, config.CHUNK_SIZE, 'wikidata_id')
    del wikidocs
    start_time = time.time()
    total = 0

    pool = multiprocessing.Pool(config.NUM_WORKERS)
    for res in pool.imap(qa, chunks):
        total += res['processed']
        res['total'] = total
        part = int(time.time() - start_time)
        res['elapsed'] = compress(res['elapsed'])
        res['total_elapsed'] = compress(part)
        logging.info(
            "Processed {processed} ({total} in total) documents in {elapsed} "
            "(running time {total_elapsed})".format(**res))

    pool.terminate()

    elapsed = int(time.time() - start_time)
    logging.info("Processed {} documents in {}".format(total,
                                                       compress(elapsed)))
    return
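The shape of the chunks handed to the qa workers is not shown here; a plausible sketch, assuming each chunk is simply a fixed-size slice of the key values from the sorted documents (chunk_size and the 'wikidata_id' key come from the call above, everything else is guesswork):

# Hypothetical sketch: slice the sorted documents into fixed-size chunks of
# just the `key` field, so each worker receives a compact list of ids.
def get_chunks(docs, chunk_size, key):
    ids = [doc[key] for doc in docs]
    return [ids[i:i + chunk_size] for i in range(0, len(ids), chunk_size)]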
Example #6
    async def do_detect(self, data: pd.DataFrame,
                        cache: Optional[ModelCache]) -> dict:

        window_size = self._detector.get_window_size(cache)
        chunk_size = window_size * self.CHUNK_WINDOW_SIZE_FACTOR
        chunk_intersection = window_size * self.CHUNK_INTERSECTION_FACTOR

        detections: List[DetectionResult] = []
        chunks = []
        # XXX: get_chunks(data, chunk_size) == get_intersected_chunks(data, 0, chunk_size)
        if self._detector.is_detection_intersected():
            chunks = get_intersected_chunks(data, chunk_intersection,
                                            chunk_size)
        else:
            chunks = get_chunks(data, chunk_size)

        for chunk in chunks:
            await asyncio.sleep(0)
            chunk_dataframe = prepare_data(chunk)
            detected: DetectionResult = self._detector.detect(
                chunk_dataframe, cache)
            detections.append(detected)

        if len(detections) == 0:
            raise RuntimeError(
                f'do_detect for {self.analytic_unit_id} got empty detection results'
            )

        detection_result = self._detector.concat_detection_results(detections)
        return detection_result.to_json()
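The XXX comment above pins down how the two helpers relate: get_chunks(data, chunk_size) behaves like get_intersected_chunks(data, 0, chunk_size). A sketch consistent with that identity (an assumption, not the project's actual code):

# `data` is anything sliceable by integer position (a sequence, or a
# DataFrame via row slicing). Assumed implementation, per the XXX note.
def get_intersected_chunks(data, intersection, chunk_size):
    step = max(chunk_size - intersection, 1)
    for start in range(0, len(data), step):
        yield data[start:start + chunk_size]
        if start + chunk_size >= len(data):
            break

def get_chunks(data, chunk_size):
    # non-overlapping special case: intersection == 0
    return get_intersected_chunks(data, 0, chunk_size)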
Example #7
File: graph.py Project: RoboR/DAGs
    def __normalize_treelevels(self):
        """
        Normalize the treelevels so they can be used to generate the tree without
        problems.

        The normalized treelevels must fulfill the condition that at any given
        level the number of nodes of that level must be at least equal (or higher)
        than the number of blocks of the next level. With the exepction of the
        root.
        """
        root = self.treelevels.pop(0)

        while True:
            modified = False
            for x, y in get_chunks(self.treelevels, 2, 1):
                if len(list(chain.from_iterable(x))) < len(y):
                    modified = True
                    # Find the smallest block of y and move it
                    # to the previous level
                    position = 0
                    min_value = float('inf')
                    for pos, value in enumerate(map(len, y)):
                        if value < min_value:
                            min_value = value
                            position = pos

                    x.append(y[position])
                    y.pop(position)

            if not modified:
                break

        self.treelevels.insert(0, root)
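get_chunks is called here with both a size and a step (2 and 1: every pair of adjacent levels), and in __generate_treelevels below with equal size and step (non-overlapping groups). A sliding-window sketch covering both calls, assuming trailing windows shorter than `size` are dropped:

# Sliding-window sketch: a slice of `size` items every `step` items.
# Assumption: a partial window at the tail is dropped.
def get_chunks(seq, size, step):
    for i in range(0, len(seq) - size + 1, step):
        yield seq[i:i + size]

get_chunks(self.treelevels, 2, 1) then yields each adjacent (level, next_level) pair, which is exactly what the normalization loop above mutates in place.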
Example #8
    def download_file(self, file_id, file_key, public=False):
        if public:
            file_key = base64_to_a32(file_key)
            file_data = self.api_req({'a': 'g', 'g': 1, 'p': file_id})
        else:
            file_data = self.api_req({'a': 'g', 'g': 1, 'n': file_id})

        k = (file_key[0] ^ file_key[4],
             file_key[1] ^ file_key[5],
             file_key[2] ^ file_key[6],
             file_key[3] ^ file_key[7])
        iv = file_key[4:6] + (0, 0)
        meta_mac = file_key[6:8]

        file_url = file_data['g']
        file_size = file_data['s']
        attributes = base64urldecode(file_data['at'])
        attributes = dec_attr(attributes, k)
        file_name = attributes['n']

        infile = requests.get(file_url, stream=True).raw
        outfile = open(file_name, 'wb')

        counter = Counter.new(
            128, initial_value=((iv[0] << 32) + iv[1]) << 64)
        decryptor = AES.new(a32_to_str(k), AES.MODE_CTR, counter=counter)

        file_mac = (0, 0, 0, 0)
        for chunk_start, chunk_size in sorted(get_chunks(file_size).items()):
            chunk = infile.read(chunk_size)
            chunk = decryptor.decrypt(chunk)
            outfile.write(chunk)

            chunk_mac = [iv[0], iv[1], iv[0], iv[1]]
            for i in range(0, len(chunk), 16):
                block = chunk[i:i+16]
                if len(block) % 16:
                    block += '\0' * (16 - (len(block) % 16))
                block = str_to_a32(block)
                chunk_mac = [
                    chunk_mac[0] ^ block[0],
                    chunk_mac[1] ^ block[1],
                    chunk_mac[2] ^ block[2],
                    chunk_mac[3] ^ block[3]]
                chunk_mac = aes_cbc_encrypt_a32(chunk_mac, k)

            file_mac = [
                file_mac[0] ^ chunk_mac[0],
                file_mac[1] ^ chunk_mac[1],
                file_mac[2] ^ chunk_mac[2],
                file_mac[3] ^ chunk_mac[3]]
            file_mac = aes_cbc_encrypt_a32(file_mac, k)

        outfile.close()

        # Integrity check
        if (file_mac[0] ^ file_mac[1], file_mac[2] ^ file_mac[3]) != meta_mac:
            raise ValueError('MAC mismatch')
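get_chunks(file_size) here evidently maps chunk start offsets to chunk sizes (note the .items() and the sort). MEGA's chunking is commonly described as sizes ramping from 128 KiB up to 1 MiB and staying there; a sketch under that assumption:

# Sketch of MEGA-style chunking: sizes ramp 128 KiB, 256 KiB, ... up to
# 1 MiB, then stay at 1 MiB; the last chunk is truncated to the file size.
# The ramp is an assumption based on common descriptions of the format.
def get_chunks(size):
    chunks = {}
    pos, chunk_size = 0, 0x20000  # 128 KiB
    while pos < size:
        chunks[pos] = min(chunk_size, size - pos)
        pos += chunks[pos]
        chunk_size = min(chunk_size + 0x20000, 0x100000)
    return chunks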
Example #9
def Evaluate(sess, model, dataset, transition_params_trained, parameters,
             epoch_num):

    start = time.time()
    accs = []
    correct_preds, total_correct, total_preds = 0., 0., 0.
    word_count = 0
    while dataset.has_next_sent('test'):
        sent = dataset.get_next_sent('test')
        feed_dict = {
            model.input_token_indices: sent.word_ids,
            model.input_token_character_indices:
            utils.pad_lists(sent.char_lists),
            model.input_token_lengths: sent.word_lengths,
            model.dropout_keep_prob: 1
        }
        unary_scores, predictions = sess.run(
            [model.unary_scores, model.predictions], feed_dict)
        if parameters['use_crf']:
            predictions, _ = tf.contrib.crf.viterbi_decode(
                unary_scores, transition_params_trained)
            predictions = predictions[1:-1]
        gold_labels = sent.ner_ids
        words = sent.word_ids
        word_count += len(words)
        accs += [a == b for (a, b) in zip(gold_labels, predictions)]
        lab_chunks = set(utils.get_chunks(gold_labels, dataset.ner_map))
        lab_pred_chunks = set(utils.get_chunks(predictions, dataset.ner_map))
        #logging.info(sent.ner_ids)
        #logging.info(predictions)
        correct_preds += len(lab_chunks & lab_pred_chunks)
        total_preds += len(lab_pred_chunks)
        total_correct += len(lab_chunks)

    p = correct_preds / total_preds if correct_preds > 0 else 0
    r = correct_preds / total_correct if correct_preds > 0 else 0
    f1 = 2 * p * r / (p + r) if correct_preds > 0 else 0

    test_time = time.time() - start
    dataset.reset_index('test')
    logging.info("epoch: %d, f1 score: %.2f", epoch_num, f1 * 100.0)

    return test_time
Example #10
 def parse_date_string(self, time_frames):
     retval = []
     tmp_list = []
     for word in time_frames:
         if word == '-':
             continue
         tmp_list.append(word)
         if re.search(CLOCK_PATTERN, word):
             retval.append(' '.join(tmp_list))
             tmp_list = []
     return utils.get_chunks(retval, 2)
Example #11
def init():
    for i, fn in enumerate(BACKING_FNs):
        p = Player(i + 1)
        players.append(p)
        p.volume = 1.0
        data_file = wave.open(fn, 'rb')
        data = data_file.readframes(data_file.getnframes())
        sequence_number = 0
        for d in get_chunks(data, FRAMES_PER_PACKET * FRAME_WIDTH):
            p.audio_packets[sequence_number] = d
            sequence_number += 1
Example #12
 def run_infer(self, sess, test, tags):
     """
     Evaluates performance on test set
     Args:
         sess: tensorflow session
         test: dataset that yields tuple of sentences, tags
         tags: {tag: index} dictionary
     Returns:
         accuracy
         f1 score
     """
     infer_res = open(self.config.infer_filename, 'w', encoding="utf-8-sig")
     accs = []
     correct_preds, total_correct, total_preds = 0., 0., 0.
     for words, labels in minibatches(test, self.config.batch_size):
         words_copy = copy.deepcopy(words)
         labels_pred, sequence_lengths = self.predict_batch(sess, words)
         # print("predict_batch", labels_pred, sequence_lengths,words_copy)
         if self.config.chars:
             _, words_res = zip(*words_copy)
         else:
             words_res = words_copy
         for word_res, lab, lab_pred, length in zip(
                 words_res, labels, labels_pred, sequence_lengths):
             lab = lab[:length]
             lab_pred = lab_pred[:length]
             # print("idx_restore", word_res, lab, lab_pred)
             infer_res.write(self.idx_restore(word_res, lab, lab_pred))
             accs += [a == b for (a, b) in zip(lab, lab_pred)]
             lab_chunks = set(get_chunks(lab, tags, self.config.DEFAULT))
             lab_pred_chunks = set(
                 get_chunks(lab_pred, tags, self.config.DEFAULT))
             correct_preds += len(lab_chunks & lab_pred_chunks)
             total_preds += len(lab_pred_chunks)
             total_correct += len(lab_chunks)
     infer_res.close()
     p = correct_preds / total_preds if correct_preds > 0 else 0
     r = correct_preds / total_correct if correct_preds > 0 else 0
     f1 = 2 * p * r / (p + r) if correct_preds > 0 else 0
     acc = np.mean(accs)
     return acc, f1
Example #13
    def evaluate(self, dev_x, dev_y):
        """
        Evaluates performance on dev set

        """
        accs = []
        correct_preds, total_correct, total_preds = 0., 0., 0.
        for i, (x_batch, y_batch) in enumerate(
                next_batch(dev_x, dev_y, self.config.batch_size,
                           shuffle=True)):

            fd, sentence_lengths, label_padded, _ = self.get_fd(
                x_batch, y_batch)

            scores, trans_params = self.sess.run(
                [self.scores, self.trans_params], feed_dict=fd)

            viterbi_sequences = self.viterbi_decode(scores, sentence_lengths,
                                                    trans_params)

            for lab, lab_pred, length in zip(label_padded, viterbi_sequences,
                                             sentence_lengths):
                lab = lab[:length]
                lab_pred = lab_pred[:length]
                accs += [a == b for (a, b) in zip(lab, lab_pred)]

                lab_chunks = set(get_chunks(lab, self.config.idx2label))
                lab_pred_chunks = set(
                    get_chunks(lab_pred, self.config.idx2label))

                correct_preds += len(lab_chunks & lab_pred_chunks)
                total_preds += len(lab_pred_chunks)
                total_correct += len(lab_chunks)

        p = correct_preds / total_preds if correct_preds > 0 else 0
        r = correct_preds / total_correct if correct_preds > 0 else 0
        f1 = 2 * p * r / (p + r) if correct_preds > 0 else 0
        acc = np.mean(accs)

        return {"acc": 100 * acc, "f1": 100 * f1}
Example #14
    def test_non_intersected_chunks(self):
        chunk_size = 4

        cases = [(tuple(range(12)), [[0, 1, 2, 3], [4, 5, 6, 7],
                                     [8, 9, 10, 11]]),
                 (tuple(range(9)), [[0, 1, 2, 3], [4, 5, 6, 7], [8]]),
                 (tuple(range(10)), [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]),
                 (tuple(range(11)), [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10]]),
                 ([], []), (tuple(range(1)), [[0]]),
                 (tuple(range(4)), [[0, 1, 2, 3]])]

        for tested, expected in cases:
            tested_chunks = list(get_chunks(tested, chunk_size))
            self.assertSequenceEqual(tested_chunks, expected)
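These cases pin the behaviour down completely: fixed-size, non-overlapping chunks, a short tail kept, and no chunks for empty input. A minimal implementation that passes them:

# Minimal chunker consistent with the test cases above.
def get_chunks(data, chunk_size):
    for i in range(0, len(data), chunk_size):
        yield list(data[i:i + chunk_size])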
Example #15
File: graph.py Project: RoboR/DAGs
    def __generate_treelinks(self):
        """
        Generate links for the current graph that create a tree.

        This function generates the tree_links that will populate
        the links of the graph. The class works in an incremental
        fashion, first the links to create a graph are generated
        and then the tree is turned into a DAG.
        """
        tree_links = []

        # Process the root
        root = self.Position(0, 0, 0)
        allSource = set()
        allDest = set()
        for block, b in enumerate(self.treelevels[1]):
            for position, x in enumerate(b):
                dest = self.Position(1, block, position)
                allDest.add(dest)
                tree_links.append(self.GraphLink(root, dest, 0))

        for level, (x, y) in enumerate(get_chunks(self.treelevels[1:-1], 2),
                                       start=1):
            election_positions = []
            for block, b in enumerate(x):
                for position, _ in enumerate(b):
                    election_positions.append(
                        self.Position(level, block, position))
            shuffle(election_positions)

            for dest_block, block in enumerate(y):
                if not election_positions:
                    print "Error::The tree levels are not normalized"
                    sys.exit(0)

                orig_position = election_positions.pop()
                allSource.add(orig_position)
                for dest_position, node in enumerate(block):
                    dest_position = self.Position(level + 1, dest_block,
                                                  dest_position)
                    tree_links.append(
                        self.GraphLink(orig_position, dest_position, 0))
                    allDest.add(dest_position)

        # Process exit node
        exit_position = self.Position(level + 2, 0, 0)
        for lastNode in allDest.difference(allSource):
            tree_links.append(self.GraphLink(lastNode, exit_position, 0))

        return tree_links
Example #16
def init():
    for i, fn in enumerate(BACKING_FNs):
        p = Player(i+1)
        globals.Players.append(p)
        p.volume = 0.25
        data_file = wave.open(fn, 'rb')
        data = data_file.readframes(data_file.getnframes())
        sequence_number = 0
        for d in get_chunks(data, FRAMES_PER_PACKET * FRAME_WIDTH):
            #TODO: this implies you can't switch order of instruments
            p.audio_packets[sequence_number] = d
            sequence_number += 1
    
    packet = create_audio_packet(-1, create_zeros(FRAMES_PER_PACKET))
    network.output_audio_queue.append(packet)
Example #17
    def run_evaluate(self, test):
        accs = []
        correct_preds, total_correct, total_preds = 0., 0., 0.

        for words, labels in batch_yield(test, self.batch_size):

            labels_pred, sequence_lengths = self.predict_batch(words)

            for lab, lab_pred, length in zip(labels, labels_pred,
                                             sequence_lengths):
                lab = lab[:length]
                lab_pred = lab_pred[:length]
                accs += [a == b for (a, b) in zip(lab, lab_pred)]
                lab_chunks = set(get_chunks(lab, self.label2id))
                lab_pred_chunks = set(get_chunks(lab_pred, self.label2id))
                correct_preds += len(lab_chunks & lab_pred_chunks)
                total_preds += len(lab_pred_chunks)
                total_correct += len(lab_chunks)

        p = correct_preds / total_preds if correct_preds > 0 else 0
        r = correct_preds / total_correct if correct_preds > 0 else 0
        f1 = 2 * p * r / (p + r) if correct_preds > 0 else 0
        acc = np.mean(accs)
        return acc, p, r, f1
Example #18
def split(args):
    """Compute base background in split and use it in each chunk
    """

    n_peaks = utils.quick_line_count(args.peaks) if args.peaks else 0
    ref_mgr = ReferenceManager(args.reference_path)
    if len(ref_mgr.list_species()) > 1 or n_peaks == 0 or ref_mgr.tss_track is None:
        chunk_def = [{'skip': True}]
        return {'chunks': chunk_def}

    # write rows of each chunk to a new peak file
    mem_in_gb = 4.0
    chunk_def = [{'__mem_gb': mem_in_gb,
                  'skip': False,
                  'chunk_start': chunk[0],
                  'chunk_end': chunk[1]} for chunk in utils.get_chunks(n_peaks, chunks=20)]
    return {'chunks': chunk_def}
Example #19
 def parse_time_frames(self):
     modified_tf = []
     if self.args.time_frames[0].isdigit():
         for chunk in utils.get_chunks(self.args.time_frames, 3):
             if re.search(r'[0-9]+-[0-9]+', ''.join(chunk)):
                 comma = re.compile('[,]')
                 modified_tf.append([
                     int(comma.sub('', chunk[0])),
                     int(comma.sub('', chunk[2]))
                 ])
     else:
         time_frames = self.parse_date_string(self.args.time_frames)
         for tf in time_frames:
             modified_tf.append([
                 utils.parse_date(tf[0]).timestamp(),
                 utils.parse_date(tf[1]).timestamp()
             ])
     return modified_tf
Example #20
def split(args):
    if args.fragments is None:
        return {'chunks': [], 'join': {}}

    # as the fragments file is not sorted by barcodes, we iterate through the files to get a list of ordered bcs
    barcodes = list(
        {bc
         for _, _, _, bc, _ in open_fragment_file(args.fragments)})

    # chunk on barcodes
    barcode_chunks = utils.get_chunks(len(barcodes), 30)
    chunks = []
    for num, bc_chunk in enumerate(barcode_chunks):
        bc_path = martian.make_path('barcode_{}.txt'.format(num))
        with open(bc_path, 'w') as f:
            f.write('\n'.join(barcodes[bc_chunk[0]:bc_chunk[1]]))
        chunks.append({'barcodes': bc_path})
    return {'chunks': chunks, 'join': {'__mem_gb': 16}}
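Both this split and the one in Example #18 call utils.get_chunks with a total count, then index barcodes[bc_chunk[0]:bc_chunk[1]], so each chunk is a half-open (start, end) interval over range(total). A sketch along those lines (the signature is assumed, not the pipeline's actual helper):

import math

# Assumed helper: split range(total) into `chunks` half-open (start, end)
# intervals, matching the chunk[0]:chunk[1] indexing above.
def get_chunks(total, chunks):
    if total == 0:
        return []
    size = int(math.ceil(total / float(chunks)))
    return [(start, min(start + size, total))
            for start in range(0, total, size)]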
Example #21
def callback(in_data, frame_count, time_info, status):
    global output_start_time
    for i, frames in enumerate(
            get_chunks(in_data, FRAMES_PER_PACKET * FRAME_WIDTH)):
        if output_start_time == 0:
            continue
        sample_gap = 1.0 / FRAME_RATE
        packet_time = time_info['input_buffer_adc_time'] + \
                      (i * sample_gap * FRAMES_PER_PACKET) - \
                      output_start_time
        seq_num = int(packet_time / (FRAMES_PER_PACKET * sample_gap))
        if seq_num >= 0:
            input_queue.append((seq_num, frames))

    out_data = create_zeros(frame_count)
    if len(output_queue):
        if output_start_time == 0:
            output_start_time = time_info['output_buffer_dac_time']
        out_data = output_queue.pop(0)
    return (out_data, pa.paContinue)
Example #22
    async def consume_data(self, data: TimeSeries,
                           cache: Optional[ModelCache]) -> Optional[dict]:
        window_size = self._detector.get_window_size(cache)

        detections: List[DetectionResult] = []

        for chunk in get_chunks(data,
                                window_size * self.CHUNK_WINDOW_SIZE_FACTOR):
            await asyncio.sleep(0)
            chunk_dataframe = prepare_data(chunk)
            detected = self._detector.consume_data(chunk_dataframe, cache)
            if detected is not None:
                detections.append(detected)

        if len(detections) == 0:
            return None
        else:
            detection_result = self._detector.concat_detection_results(
                detections)
            return detection_result.to_json()
Example #23
    def build(self):
        result = self.data_base.reverse_index.delete_many({})
        print(result.deleted_count)
        result = self.data_base.user_length.delete_many({})
        print(result.deleted_count)

        for user in tqdm.tqdm(self.data_base.forward_index.find(),
                              total=self.data_base.forward_index.count()):
            splitted = user['text'].split()
            self.user_length[user['uid']] = len(splitted)
            [self.update_reverse(token, user['uid']) for token in splitted]

        subprocess.run('sudo service mongod stop'.split())
        time.sleep(2)
        subprocess.run('sudo service mongod start'.split())
        time.sleep(2)

        token_chunks = get_chunks(list(self.reverse_index.keys()),
                                  ReverseIndex.CHUNK_SIZE)

        for chunk in tqdm.tqdm(token_chunks, total=len(token_chunks)):
            local_index = dict()
            for token in chunk:
                local_index[token] = list(
                    zip(self.reverse_index[token].keys(),
                        self.reverse_index[token].values()))

            chunk_insertion = [{
                'token': token,
                'uids_freqs': local_index[token]
            } for token in local_index]
            self.data_base.reverse_index.insert_many(chunk_insertion)

        length_insertions = [{
            'uid': uid,
            'length': self.user_length[uid]
        } for uid in self.user_length]
        self.data_base.user_length.insert_many(length_insertions)
Example #24
    async def process_data(self, data: TimeSeries, cache: ModelCache) -> dict:
        assert isinstance(self._detector, detectors.ProcessingDetector), \
            f'{self.analytic_unit_id} detector is not ProcessingDetector, can`t process data'
        assert cache is not None, f'{self.analytic_unit_id} got empty cache for processing data'

        processed_chunks = []
        window_size = self._detector.get_window_size(cache)
        for chunk in get_chunks(data,
                                window_size * self.CHUNK_WINDOW_SIZE_FACTOR):
            await asyncio.sleep(0)
            chunk_dataframe = prepare_data(chunk)
            processed = self._detector.process_data(chunk_dataframe, cache)
            if processed is not None:
                processed_chunks.append(processed)

        if len(processed_chunks) == 0:
            raise RuntimeError(
                f'process_data for {self.analytic_unit_id} got empty processing results'
            )

        # TODO: maybe we should process all chunks inside of detector?
        result = self._detector.concat_processing_results(processed_chunks)
        return result.to_json()
Example #25
File: graph.py Project: RoboR/DAGs
    def __generate_treelevels(self, root, exitNode, nodelists, depth):
        """
        Generate the levels of the tree using the nodelists.

        root -> root of the tree.
        nodelists -> A list of lists containing nodes.
        depth -> The depth of the tree

        Return a list of lists.
        """
        res = [[[root]], [nodelists[0]]]

        if depth <= 2:
            depth = 3

        lists_per_level = (len(nodelists) - 1) // (depth - 2)
        if lists_per_level <= 0:
            print("Warning::The specified depth is too big")
            self.valid_graph = False
            lists_per_level = 1

        return res + list(
            get_chunks(nodelists[1:], lists_per_level,
                       lists_per_level)) + [[[exitNode]]]
Example #26
def main():

    parser = argparse.ArgumentParser(
        description='Calculate QoE and error for PanoSalNet algorithm')

    parser.add_argument('-D',
                        '--dataset',
                        type=int,
                        required=True,
                        help='Dataset ID (1 or 2)')
    parser.add_argument('-T',
                        '--topic',
                        required=True,
                        help='Topic in the particular Dataset (video name)')
    parser.add_argument('--fps',
                        type=int,
                        required=True,
                        help='fps of the video')
    parser.add_argument(
        '-Q',
        '--quality',
        required=True,
        help=
        'Preferred bitrate quality of the video (360p, 480p, 720p, 1080p, 1440p)'
    )

    args = parser.parse_args()

    if args.dataset != 1 and args.dataset != 2:
        print("Incorrect value of the Dataset ID provided!!...")
        print("======= EXIT ===========")
        exit()

    # Get the necessary information regarding the dimensions of the video
    print("Reading JSON...")
    file = open('./meta.json')
    jsonRead = json.load(file)

    nusers = jsonRead["dataset"][args.dataset - 1]["nusers"]
    width = jsonRead["dataset"][args.dataset - 1]["width"]
    height = jsonRead["dataset"][args.dataset - 1]["height"]
    view_width = jsonRead["dataset"][args.dataset - 1]["view_width"]
    view_height = jsonRead["dataset"][args.dataset - 1]["view_height"]
    milisec = jsonRead["dataset"][args.dataset - 1]["milisec"]

    pref_bitrate = jsonRead["bitrates"][args.quality]
    ncol_tiles = jsonRead["ncol_tiles"]
    nrow_tiles = jsonRead["nrow_tiles"]
    player_width = jsonRead["player_width"]
    player_height = jsonRead["player_height"]

    player_tiles_x = math.ceil(player_width * ncol_tiles * 1.0 / width)
    player_tiles_y = math.ceil(player_height * nrow_tiles * 1.0 / height)

    PATH_ACT = '../../Viewport/ds{}/'.format(args.dataset)
    PATH_PRED = './head_prediction/ds{}/'.format(args.dataset)

    manhattan_error, x_mae, y_mae, final_qoe = [], [], [], []
    count_frames = 0
    for usernum in range(nusers):
        print('User_{}'.format(usernum))
        user_manhattan_error = 0.

        viewport = pickle.load(open(
            PATH_ACT + "viewport_ds{}_topic{}_user{}".format(
                args.dataset, args.topic, usernum + 1), "rb"),
                               encoding='latin1')
        p_viewport = pickle.load(open(
            PATH_PRED + "topic{}_user{}".format(args.topic, usernum), "rb"),
                                 encoding="latin1")

        frame_nos = []
        act_viewport, frame_nos, max_frame = get_act_tiles(
            viewport, frame_nos, args.fps, milisec, width, height,
            view_width, view_height)

        # Predicted Tile = max of the probabilities in output
        pred_max_viewport = []
        for fr in range(len(p_viewport)):
            prob = p_viewport[fr]
            argmax = np.where(prob == prob.max())
            pred_max_viewport.append((argmax[0][0], argmax[1][0]))

        # Assert len(actual frames) = len(predicted frames)
        pred_viewport = p_viewport
        act_viewport = act_viewport[:len(pred_viewport)]
        frame_nos = frame_nos[:len(pred_viewport)]

        pred_viewport = pred_viewport[:len(act_viewport)]
        frame_nos = frame_nos[:len(pred_viewport)]

        # Calculate Manhattan Error
        for fr in range(len(pred_max_viewport)):
            act_tile = act_viewport[fr]
            pred_tile = pred_max_viewport[fr]

            # Get corrected error
            tile_col_dif = ncol_tiles
            tile_row_dif = act_tile[0] - pred_tile[0]
            tile_col_dif = min(
                pred_tile[1] - act_tile[1], act_tile[1] + ncol_tiles -
                pred_tile[1]) if act_tile[1] < pred_tile[1] else min(
                    act_tile[1] - pred_tile[1], ncol_tiles + pred_tile[1] -
                    act_tile[1])

            current_tile_error = abs(tile_row_dif) + abs(tile_col_dif)
            user_manhattan_error += current_tile_error

        manhattan_error.append(user_manhattan_error / len(pred_max_viewport))
        count_frames += len(act_viewport)

        act_tiles, pred_tiles, chunk_frames = get_chunks(
            act_viewport, pred_viewport, frame_nos, max_frame, args.fps)

        # Allocate bitrate
        vid_bitrate = alloc_bitrate(pred_tiles, chunk_frames, nrow_tiles,
                                    ncol_tiles, pref_bitrate)

        # Calculate QoE
        q = calc_qoe(vid_bitrate, act_tiles, frame_nos, chunk_frames, width,
                     height, nrow_tiles, ncol_tiles, player_width,
                     player_height)
        final_qoe.append(q)

    avg_qoe = np.mean(final_qoe)
    avg_manhattan_error = np.mean(manhattan_error)

    #Print averaged results
    print("\n======= RESULTS ============")
    print('PanoSalNet')
    print('Dataset: {}'.format(args.dataset))
    print('Topic: ' + args.topic)
    print('Pred_nframe: {}'.format(args.fps))
    print('Avg. QoE: {}'.format(avg_qoe))
    print('Avg. Manhattan error: {}'.format(avg_manhattan_error))
    print('Count: {}'.format(count_frames))
    print('\n\n')
Example #27
import codecs

import utils

if __name__ == '__main__':

    accs = []
    correct_preds, total_correct, total_preds = 0., 0., 0.
    file_input = codecs.open('005_test.txt', 'r', 'UTF-8')
    format_len = 8
    predictions = []
    gold_labels = []
    count = 0
    for cur_line in file_input:
        cur_line = cur_line.strip()
        entity = cur_line.split()
        if len(entity) == format_len:
            predictions.append(entity[-2])
            gold_labels.append(entity[-1])
        else:
            lab_chunks = set(utils.get_chunks(gold_labels))
            lab_pred_chunks = set(utils.get_chunks(predictions))
            correct_preds += len(lab_chunks & lab_pred_chunks)
            total_preds += len(lab_pred_chunks)
            total_correct += len(lab_chunks)
            gold_labels = []
            predictions = []
    p = correct_preds / total_preds if correct_preds > 0 else 0
    r = correct_preds / total_correct if correct_preds > 0 else 0
    f1 = 2 * p * r / (p + r) if correct_preds > 0 else 0
    print(f1)
    file_input.close()
Example #28
                        required=True,
                        choices=['train', 'valid', 'test'],
                        help="train/valid/test data?")
    args = parser.parse_args()

    assert os.path.exists(args.src_fname), "file {} not found".format(
        args.src_fname)
    assert os.path.exists(args.ref_fname), "file {} not found".format(
        args.ref_fname)

    src_lines = read_file(args.src_fname)
    ref_lines = read_file(args.ref_fname)
    assert len(src_lines) == len(ref_lines), \
        "src has {} lines but ref has {} lines".format(len(src_lines), len(ref_lines))

    src_lines_chunked = get_chunks(src_lines, args.num_splits)
    ref_lines_chunked = get_chunks(ref_lines, args.num_splits)

    with parallel_backend('multiprocessing', n_jobs=args.num_splits):
        Parallel()(delayed(write_to_presumm_format)
                   (chunk_idx, src_chunk, ref_chunk, args.split,
                    args.presumm_out, args.output_name)
                   for chunk_idx, (src_chunk, ref_chunk) in enumerate(
                       zip(src_lines_chunked, ref_lines_chunked)))
"""
 # To run
    
    export BASE_DATA_PATH=/projects/tir5/users/aashfaq/Capstone/data/genetic/combined
    export DATA_DIR=$BASE_DATA_PATH/bertsum_data_train/
    mkdir $DATA_DIR
    python -m write_to_presumm_format -src_fname $BASE_DATA_PATH/train.ext -ref_fname $BASE_DATA_PATH/train.target -presumm_out $DATA_DIR -num_splits 20 -split train
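Note that, unlike most examples on this page, the second argument here is the number of splits rather than a chunk size: the chunks are zipped against num_splits parallel jobs. A sketch under that reading (assumed signature):

import math

# Assumed: the second argument is the number of output chunks, so the chunk
# count lines up with the num_splits parallel jobs above.
def get_chunks(lines, num_splits):
    size = max(1, math.ceil(len(lines) / num_splits))
    return [lines[i:i + size] for i in range(0, len(lines), size)]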
Example #29
def add_malware():
    '''
    Adds a sample to the repository. Performs hashing and filemagic
    analysis of the uploaded sample.

    @tags : comma separated tags list
    @file : binary sample stream

    returns : JSON status message 
    '''

    try:
        with timeout(Config().api.timeout * 60, exception=RuntimeError):
            tags = request.forms.get('tags').split(',')
            data = request.files.file
            data.file.seek(0)

            filename = data.filename
            sampleData = data.file.read()

            logging.debug('[%s] Generating hashes' % filename)
            md5 = hashlib.md5(sampleData).hexdigest()
            sha1 = hashlib.sha1(sampleData).hexdigest()
            sha256 = hashlib.sha256(sampleData).hexdigest()
            sha512 = hashlib.sha512(sampleData).hexdigest()
            filetype = get_type(sampleData)

            key = {'md5': md5}

            logging.debug('Querying database for already existing file (hash=%s)'
                          % md5)
            existing = db.fs.files.find_one({'md5': md5})

            upload_sample = True
            if existing:
                logging.info('Sample already exists')
                logging.info('Verifying contents')
                if not md5 == existing['md5']:
                    logging.warning('Checksum not matching')
                    upload_sample = True
                else:
                    logging.info('Checksum matching')
                    upload_sample = False
            else:
                upload_sample = True

            if upload_sample:
                logging.debug('Uploading sample')
                new = fs.new_file(filename=filename, sha1=sha1,
                                  sha256=sha256, sha512=sha512,
                                  filetype=filetype)
                for chunk in get_chunks(sampleData):
                    logging.debug('writing chunk')
                    new.write(chunk)
                new.close()
                logging.info('Uploaded sample')

            add_tags(key=key, tags=tags)

            logging.debug('Reclaiming memory')
            del sampleData

            response.content_type = 'application/json'
            return jsonize({'message': 'added'})
    except RuntimeError:
        response.content_type = 'application/json'
        return (jsonize({'error': 'timeout'}), 504)
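get_chunks(sampleData) here walks a byte string in pieces for the GridFS writer; a minimal sketch, with the default chunk size as an assumption:

# Minimal byte chunker; the 8 MiB default is an assumption, not the
# repository's actual value.
def get_chunks(data, chunk_size=8 * 1024 * 1024):
    for i in range(0, len(data), chunk_size):
        yield data[i:i + chunk_size]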
Example #30
def Eval(sess):
    """Builds and evaluates a network."""
    logging.set_verbosity(logging.INFO)
    #bpe = BPE(codecs.open("code-file", encoding='utf-8'), "@@")
    wordMapPath = "ner_word_index"
    nerMapPath = "ner_index"
    pMapPath = "ner_prefix_index"
    sMapPath = "ner_suffix_index"

    prefix2id = utils.read_pickle_file(pMapPath)
    suffix2id = utils.read_pickle_file(sMapPath)
    word2id = utils.read_pickle_file(wordMapPath)
    tag2id = utils.read_pickle_file(nerMapPath)

    loading_time = time.time()
    logging.info("loading data and precomputing features...")
    dataset = Dataset(None,
                      None,
                      FLAGS.test_corpus,
                      format_list=['FORM', 'NER'])
    dataset.load_dataset(word2id=word2id,
                         tag2id=tag2id,
                         prefix2id=prefix2id,
                         suffix2id=suffix2id,
                         fgen=False)
    name = 'test'

    logging.info('training sentences: %d', dataset.get_sent_num(name))
    logging.info("logging time: %.2f", time.time() - loading_time)

    if FLAGS.word_only:
        feature_sizes = [8]
        domain_sizes = [dataset.vocabulary_size]
        embedding_dims = [100]
    else:
        feature_sizes = [
            8, 8, 2, 8, 8, 4
        ]  #num of features for each feature group: capitalization, words, prefix_2, suffix_2, tags_history
        domain_sizes = [
            dataset.vocabulary_size, 3, 3, dataset.prefix_size,
            dataset.suffix_size, dataset.number_of_classes + 1
        ]
        embedding_dims = [100, 8, 8, 50, 50, 50]
    num_actions = dataset.number_of_classes
    hidden_layer_sizes = map(int, FLAGS.hidden_layer_sizes.split(','))
    tagger = GreedyTagger(num_actions,
                          feature_sizes,
                          domain_sizes,
                          embedding_dims,
                          hidden_layer_sizes,
                          gate_gradients=True)

    tagger.AddEvaluation(FLAGS.batch_size)
    tagger.AddSaver()
    sess.run(tagger.inits.values())
    tagger.saver.restore(sess, FLAGS.model_path)
    logging.info('Evaluating training network.')
    t = time.time()
    num_epochs = None
    epochs = 0
    epochs, sent_batch = utils.loadBatch(FLAGS.batch_size, epochs, dataset,
                                         name)
    number_of_words = 0
    while True:
        sent_batch, epochs, feature_endpoints, gold_tags, words = utils.get_current_features(
            sent_batch, epochs, dataset, name, FLAGS.word_only)
        predictions, tf_eval_metrics = sess.run(
            [tagger.evaluation['predictions'], tagger.evaluation['logits']],
            feed_dict={tagger.test_input: feature_endpoints})
        utils.set_current_tags(sent_batch, predictions)
        if num_epochs is None:
            num_epochs = epochs
        elif num_epochs < sent_batch[0].get_epoch():
            break
    t_end = time.time()
    dataset.reset_index(name)
    for sent in sent_batch:
        sent.reset_state()
    accs = []
    correct_preds, total_correct, total_preds = 0., 0., 0.
    output_file = codecs.open(FLAGS.output_path, 'w', 'UTF-8')
    while dataset.has_next_sent(name):
        sent = dataset.get_next_sent(name)
        words = sent.get_word_list()
        number_of_words += len(words)
        gold_labels = sent.ner_ids
        accs += [a == b for (a, b) in zip(gold_labels, sent.output_tags)]
        lab_chunks = set(utils.get_chunks(gold_labels, dataset.id2tag))
        lab_pred_chunks = set(
            utils.get_chunks(sent.output_tags, dataset.id2tag))
        correct_preds += len(lab_chunks & lab_pred_chunks)
        total_preds += len(lab_pred_chunks)
        total_correct += len(lab_chunks)
        output_string = ""
        for a, b, c in zip(words, gold_labels, sent.output_tags):
            split_line = [a, dataset.id2tag[b], dataset.id2tag[c]]
            output_string += ' '.join(split_line) + '\n'
        output_file.write(output_string + '\n')

    test_time = t_end - t
    output_file.close()
    p = correct_preds / total_preds if correct_preds > 0 else 0
    r = correct_preds / total_correct if correct_preds > 0 else 0
    f1 = 2 * p * r / (p + r) if correct_preds > 0 else 0
    logging.info("f1 score: %.2f; number of words: %d", f1, number_of_words)
Example #31
    def evaluate(self, test, is_test_set=False):

        x_test = test[0]
        y_test = test[1]
        accs = []
        wrong_predictions = []
        lab_c = []
        pred_c = []
        """
        wrong_predictions is a list of tuples of type (sentence, fp_set, fn_set, lab, lab_pred)
        """
        correct_preds, total_correct, total_preds = 0., 0., 0.
        for sentences_batch, labels_batch in get_minibatch(
            (x_test, y_test), self.config.batch_size):
            labels_pred_batch, sequence_lengths_batch = self.predict_batch(
                sentences_batch)
            sentence_index = 0
            for lab, lab_pred, length in zip(labels_batch, labels_pred_batch,
                                             sequence_lengths_batch):
                lab = lab[:length]
                lab_pred = lab_pred[:length]
                accs += [a == b for (a, b) in zip(lab, lab_pred)]

                lab_chunks = set(
                    get_chunks(lab, self.config.vocab_tags, self.config))
                lab_pred_chunks = set(
                    get_chunks(lab_pred, self.config.vocab_tags, self.config))

                correct_preds += len(lab_chunks & lab_pred_chunks)
                total_preds += len(lab_pred_chunks)
                total_correct += len(lab_chunks)

                lab_c.append(lab)
                pred_c.append(lab_pred)

                fp_preds = lab_pred_chunks - lab_chunks
                fn_preds = lab_chunks - lab_pred_chunks

                if is_test_set and (len(fp_preds) != 0 or len(fn_preds) != 0):
                    wrong_pred = (sentences_batch[sentence_index], fp_preds,
                                  fn_preds, lab, lab_pred)
                    # print len(fp_preds) + len(lab_chunks & lab_pred_chunks)
                    # print len(fn_preds) + len(lab_chunks & lab_pred_chunks)
                    # print len(lab_pred_chunks)
                    # print len(lab_chunks)
                    # print fp_preds
                    # print fn_preds
                    wrong_predictions.append(wrong_pred)
                sentence_index += 1

        p = correct_preds / total_preds if correct_preds > 0 else 0
        r = correct_preds / total_correct if correct_preds > 0 else 0
        f1 = 2 * p * r / (p + r) if correct_preds > 0 else 0
        acc = np.mean(accs)
        '''
        print "Correct: " + str(correct_preds)
        print "Total Pred: " + str(total_preds)
        print "Total Correct: " + str(total_correct)
        '''
        if is_test_set:
            print "Precision: " + str(p)
            print "Recall: " + str(r)
            print "F1: " + str(f1)

        #if is_test_set:
        #    write_wrong_predictions_to_file(wrong_predictions, self.config)
        pdir = self.config.dir_model
        import pickle
        pickle.dump(lab_c,
                    open(pdir + 'lab_.pkl', 'wb'),
                    protocol=pickle.HIGHEST_PROTOCOL)
        pickle.dump(pred_c,
                    open(pdir + 'pred_.pkl', 'wb'),
                    protocol=pickle.HIGHEST_PROTOCOL)

        return {
            "acc": 100 * acc,
            "f1": 100 * f1,
            "Precision": 100 * p,
            "Recall": 100 * r
        }
Example #32
for i in range(num_epochs):
    """ Set variables to zero """
    batch_losses, counter, batch_accuracy = 0, 0, 0
    recall, precision, val_recall, val_precision = 0, 0, 0, 0
    epoch_losses, epoch_counter, epoch_accuracy = 0, 0, 0

    for data in dataloader:
        """ Training """
        net.train()
        targets = data['training']['labels']
        inputs = data['training']['sequence']

        for k in range(num_batches):
            """ Get truncated steps """
            x, t = get_chunks(inputs, targets, k, mini_batch)

            if torch.sum(t) > 0 or np.random.uniform(0, 1) < 0.1:
                outputs = net(x)
                pred = torch.max(outputs, dim=1)[1].data.numpy()[0, :]
                ground = t.data.numpy()[0, :]

                optimizer.zero_grad()
                loss = loss_fn(outputs, t)

                loss.backward()
                optimizer.step()

                batch_losses += loss.data.numpy()
                batch_accuracy += f1_score(ground, pred, average='macro')
                recall += recall_score(ground, pred, average='macro')
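The snippet above is cut off, but the call shape is visible: get_chunks(inputs, targets, k, mini_batch) returns the k-th truncated window of both tensors, as used for truncated backpropagation through time. A sketch under that assumption (a [batch, time] layout is also assumed):

# Hypothetical truncated-BPTT windowing: the k-th slice of length
# `chunk_len` along the time axis. Layout [batch, time] is an assumption.
def get_chunks(inputs, targets, k, chunk_len):
    start = k * chunk_len
    return (inputs[:, start:start + chunk_len],
            targets[:, start:start + chunk_len])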