def unfold_tuple_to_columns(series_or_df_with_tuple_column,
                            new_column_names=None,
                            column_name=None):
    """ Unfolds a column `column_name` with tuples and adds the unfolded series as new columns.
    Original column of tuples unchanged.
    Or dict.
    """
    to_unfold = series_or_df_with_tuple_column

    if isinstance(series_or_df_with_tuple_column, pd.DataFrame):
        assert column_name is not None
        to_unfold = series_or_df_with_tuple_column[column_name]

    # Note: to_unfold.apply(pd.Series) would also work here but was too slow.

    if isinstance(to_unfold.iloc[0], dict):
        new_column_names = new_column_names if new_column_names is not None else list(
            to_unfold.iloc[0].keys())
        data = list(
            map(list,
                more_itertools.unzip((tuple(d.values()) for d in to_unfold))))
    else:
        # tuple or list
        new_column_names = new_column_names if new_column_names is not None else to_unfold.name
        data = list(map(list, more_itertools.unzip(to_unfold)))

    data = dict(zip(new_column_names, data))

    return pd.DataFrame(series_or_df_with_tuple_column).assign(**data)
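A minimal usage sketch of the helper above on a made-up DataFrame (the column and key names are illustrative only, and `pandas`/`more_itertools` must be importable for the function itself):
import pandas as pd
import more_itertools

df = pd.DataFrame({"pair": [(1, "a"), (2, "b"), (3, "c")]})
out = unfold_tuple_to_columns(df, new_column_names=["num", "letter"],
                              column_name="pair")
print(out.columns.tolist())  # ['pair', 'num', 'letter']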
Example #2
def main():
    args = parse_arguments()
    content = [x.strip() for x in args.query if x.strip()]

    if args.sep != "none":
        sep = None if args.sep == "space" else args.sep
        qnos, queries = unzip(line.split(sep, maxsplit=1) for line in content)
    else:
        queries = content
        qnos = list(map(str, range(len(queries))))

    trans = str.maketrans("", "", string.punctuation)
    queries = [s.translate(trans) for s in queries]

    qnos = list(qnos)
    queries = list(queries)

    indri = IndriRunQuery(None, str(args.index.resolve()), args.scheduler)

    if args.scheduler:
        output = indri.run_distributed(qnos, queries, extra={"count": args.count})
    else:
        output = indri.run_batch(
            qnos,
            queries,
            working_set=[],
            extra={"count": args.count},
            workers=args.workers,
        )

    args.output.writelines(output)
Example #3
def run_model(model_prefix: str, model_epoch: int, config: BasicConfig, data_df: pd.DataFrame, save_path: Path):
    use_gpu = len(config.gpus) > 0
    if use_gpu:
        ctx = [mx.gpu(cur_idx) for cur_idx in config.gpus]
    else:
        ctx = [mx.cpu()]
    sym, args, auxs = mx.model.load_checkpoint(model_prefix, model_epoch)
    model = mx.mod.Module(symbol=sym, context=ctx, label_names=None)
    data_shape = (1, 3, 112, 112)
    model.bind(data_shapes=[('data', data_shape)], for_training=False)
    model.set_params(args, auxs)
    dataset = InfoDataset(data_df, filter_fn=config.filter_fn, augs=config.test_augmentations)
    data = DataLoader(
        dataset,
        batch_size=config.batch_size,
        shuffle=False,
        sampler=None,
        num_workers=config.num_workers,
        pin_memory=use_gpu
    )
    predictions = []
    all_paths, labels = unzip(dataset.data)
    for i, batch in tqdm(enumerate(data), total=len(data)):
        batch_data = mx.gluon.utils.split_and_load(batch[0], ctx_list=ctx, even_split=False)
        batch = mx.io.DataBatch(batch_data)
        model.forward(batch, is_train=False)
        predictions.append(model.get_outputs()[0].asnumpy())
    predictions = np.concatenate(predictions, axis=0)
    labels = np.array(list(labels))
    all_paths = list(all_paths)
    np.savez(str(save_path), paths=all_paths, labels=labels, preds=predictions)
    return all_paths, labels, predictions
Example #4
    def collate_fn(inputs):
        (input_ids, token_type_ids, attention_mask,
         positions, widths, boundary_pairs,
         options, targets) = map(list, unzip(inputs))

        input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)
        token_type_ids = pad_sequence(token_type_ids, batch_first=True, padding_value=0)
        attn_masks = pad_sequence(attention_mask, batch_first=True, padding_value=0)

        width_max = max(widths)
        context_width_max = max(
            left_end - left_start + right_end - right_start
            for (left_start, left_end), (right_start, right_end) in boundary_pairs)
        gather_index = torch.arange(0, width_max, dtype=torch.long).unsqueeze(0).repeat(len(inputs), 1).clone()
        context_gather_index = torch.arange(
            0, context_width_max, dtype=torch.long).unsqueeze(0).repeat(len(inputs), 1).clone()
        for i, (p, w, ((left_start, left_end), (right_start, right_end))) in enumerate(
                zip(positions, widths, boundary_pairs)):
            gather_index.data[i, :w] = torch.arange(p, p + w, dtype=torch.long).data

            cw = left_end - left_start + right_end - right_start
            context_gather_index.data[i, :cw] = torch.cat([torch.arange(left_start, left_end, dtype=torch.long),
                                                           torch.arange(right_start, right_end, dtype=torch.long)]).data

        batch = {'input_ids': input_ids,
                 'token_type_ids': token_type_ids,
                 'attention_mask': attn_masks,
                 'gather_index': gather_index,
                 'context_gather_index': context_gather_index,
                 'positions': torch.tensor(positions).long(),
                 'option_ids': torch.tensor(options).long(),
                 'targets': torch.tensor(targets).long()}
        return batch
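The gather_index built above is typically consumed later with torch.gather to pull the target span's token vectors out of the encoder output; a minimal sketch of that pattern (the tensor names and shapes below are assumptions, not part of this snippet):
import torch

# Hypothetical shapes: sequence_output is (batch, seq_len, hidden),
# gather_index is (batch, width_max) as built in collate_fn above.
batch_size, seq_len, hidden = 2, 16, 8
sequence_output = torch.randn(batch_size, seq_len, hidden)
gather_index = torch.randint(0, seq_len, (batch_size, 5))

# Expand the index over the hidden dimension and gather the span tokens.
index = gather_index.unsqueeze(-1).expand(-1, -1, hidden)
span_output = torch.gather(sequence_output, dim=1, index=index)
print(span_output.shape)  # torch.Size([2, 5, 8])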
Example #5
    def _transform_unidify(
        self, results_dir: Path, twitter_api_settings: TwitterApiSettings,
    ) -> Counter[_ExecuteResult]:
        result_counter = Counter[_ExecuteResult]()

        head, entries_tweet_ids = spy(
            self._iter_entries_tweet_ids(results_dir, result_counter)
        )
        if not head:  # Check if any entries with Tweet-IDs exist (else unzip fails).
            return result_counter

        entries, tweet_ids = cast(
            Tuple[Iterator[BatchEntry], Iterator[TweetId]], unzip(entries_tweet_ids)
        )
        for entry, tweets in groupby_transform(
            zip(entries, statuses_lookup(tweet_ids, twitter_api_settings)),
            keyfunc=itemgetter(0),
            valuefunc=itemgetter(1),
        ):
            write_jsonl_lines(
                results_dir / entry.data_file_name,
                (tweet for tweet in tweets if tweet is not None),
                use_lzma=True,
            )
            write_json(
                results_dir / entry.meta_file_name, entry, overwrite_existing=True
            )
            result_counter[_ExecuteResult.SUCCESS] += 1

        return result_counter
Example #6
 def run(self, ds: stream.DataStream) -> stream.DataStream:
     raw_topics_scores_ds = super().run(ds)
     topics_with_ctx = self._get_topic_per_item(raw_topics_scores_ds)
     topics, ctxs = more_itertools.unzip(topics_with_ctx)
     return stream.DataStream(items=topics,
                              applied_ops=ds.applied_ops + [self],
                              context=ctxs)
Example #7
    def collate_fn(inputs):
        (input_ids, token_type_ids, attention_mask, positions, widths, options,
         targets) = map(list, unzip(inputs))

        input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)
        token_type_ids = pad_sequence(token_type_ids,
                                      batch_first=True,
                                      padding_value=0)
        attn_masks = pad_sequence(attention_mask,
                                  batch_first=True,
                                  padding_value=0)

        width_max = max(widths)
        gather_index = torch.arange(0, width_max,
                                    dtype=torch.long).unsqueeze(0).repeat(
                                        len(inputs), 1).clone()
        for i, (p, w) in enumerate(zip(positions, widths)):
            gather_index.data[i, :w] = torch.arange(p, p + w,
                                                    dtype=torch.long).data

        batch = {
            'input_ids': input_ids,
            'token_type_ids': token_type_ids,
            'attention_mask': attn_masks,
            'gather_index': gather_index,
            'positions': torch.tensor(positions).long(),
            'option_ids': torch.tensor(options).long(),
            'targets': torch.tensor(targets).long()
        }
        return batch
Example #8
    def get_docs_stream(self, ds: DataStream) -> DataStream:
        """Returns DataStream of spacy Docs.
        If the data stream already contains spacy Docs then they
        are returned as-is otherwise the nlp object is used to
        create spacy Docs

        Parameters
        ----------
        ds : DataStream
            input data stream

        Returns
        ------
        out : DataStream
            A datastream containing an iterable of spacy's `Doc` objects
        """
        if ds.item_type != Doc:
            docs_with_context = self.nlp.pipe(
                zip(ds, ds.context),
                as_tuples=True,
                n_process=config.ALLOCATED_PROCESSOR_FOR_SPACY,
            )
            new_docs, context = more_itertools.unzip(docs_with_context)
            return DataStream(items=new_docs,
                              applied_ops=ds.applied_ops,
                              context=context)
        else:
            return ds
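For reference, a minimal standalone sketch of the nlp.pipe(..., as_tuples=True) plus unzip pattern used above; the blank pipeline and the texts below are stand-ins for self.nlp and the incoming stream:
import spacy
import more_itertools

nlp = spacy.blank("en")  # stand-in for self.nlp
texts = ["first text", "second text"]
contexts = [{"id": 1}, {"id": 2}]

docs_with_context = nlp.pipe(zip(texts, contexts), as_tuples=True)
new_docs, context = more_itertools.unzip(docs_with_context)
print([doc.text for doc in new_docs], list(context))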
Example #9
def initialize_data_from_leavesdb(dataset_name='PNAS',
                                  splits={'train':0.7,'validation':0.3},
                                  threshold=50,
                                  exclude_classes=[],
                                  include_classes=[]):
    datasets = {
            'PNAS': pnas_dataset.PNASDataset(),
            'Leaves': leaves_dataset.LeavesDataset(),
            'Fossil': fossil_dataset.FossilDataset()
            }
    data_files = datasets[dataset_name]

    data_files.exclude_rare_classes(threshold=threshold)
    encoder = base_dataset.LabelEncoder(data_files.classes)
    classes = list((set(encoder.classes)-set(exclude_classes)).union(set(include_classes)))
    data_files, excluded_data_files = data_files.enforce_class_whitelist(class_names=classes)

    x = list(data_files.data['path'].values)
    y = np.array(encoder.encode(data_files.data['family']))

    shuffled_data = list(zip(x,y))
    random.shuffle(shuffled_data)
    partitioned_data = partition_data(data=shuffled_data,
                                      partitions=OrderedDict(splits))
    split_data = {k:v for k,v in partitioned_data.items() if len(v)>0}

    for subset, subset_data in split_data.items():
        split_data[subset] = [list(i) for i in unzip(subset_data)]

    return split_data, data_files, excluded_data_files
Example #10
 def evolve_local(self,
                  num_gen: int,
                  overwrite: bool = True,
                  callbacks: t.Optional[t.List[Callback]] = None):
     """
     Evolve populations sequentially, using a single processor.
     No early stopping applied.
     Exists mainly for testing purposes.
     """
     data_to_evolve = zip(self.populations, self.records
                          or [None] * len(self.populations))
     operators = self.ops
     results = []
     for individuals, records in tqdm(data_to_evolve,
                                      total=len(self.populations),
                                      desc='Evolving population'):
         individuals, records = self._evolver.evolve(
             num_gen, self.ops, self.genetic_params.Population_size,
             individuals, records)
         if callbacks:
             individuals, records, operators = self._evolver.call_callbacks(
                 callbacks, individuals, records, operators)
         results.append((individuals, records, callbacks))
     populations, records, callbacks = map(list, unzip(results))
     if overwrite:
         self.populations, self.records = populations, records
     return populations, records, callbacks
Example #11
def generate_image():
    # import plotly
    # plotly.io.orca.config.executable = '/home/miguel/anaconda3/bin/orca'
    layout = go.Layout(autosize=True, margin={'l': 0, 'r': 0, 't': 0, 'b': 0})
    players, score, ping, _ = unzip(get_players())
    players = list(players)
    score = list(score)
    ping = list(ping)
    height = len(players) * 25 + 40
    fig = go.Figure(  # columnwidth=[1,0.5,0.5],
        layout=layout,
        data=[
            go.Table(
                columnwidth=[70, 15, 15],
                header=dict(
                    values=['<b>Player</b>', '<b>Score</b>', '<b>Ping</b>'],
                    line_color='darkslategray',
                    fill_color='lightskyblue',
                    font_size=18,
                    height=30,
                    align=['left', 'center', 'center']),
                cells=dict(values=[players, score, ping],
                           height=25,
                           font_size=16,
                           line_color='darkslategray',
                           fill_color='lightcyan',
                           align=['left', 'center', 'center']))
        ])
    fig.update_layout(width=400, height=height)
    loc = '/tmp/players.jpg'
    fig.write_image(loc, engine='kaleido')
    return loc
Example #12
def detection_func(lines):
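    # Note: `prompt_re`, `continuation_lines`, and `name` are defined elsewhere
    # in the original source; this excerpt shows only the detection logic.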
    try:
        _, line = lines.peek()
    except StopIteration:
        line = ""

    match = prompt_re.match(line)
    if not match:
        return None

    groups = match.groupdict()
    indent = len(groups["indent"])
    prompt_length = len(groups["prompt"])

    detected_lines = list(
        itertools.chain(
            [more_itertools.first(lines)],
            continuation_lines(lines, indent, prompt_length),
        ))
    line_numbers, lines = map(tuple, more_itertools.unzip(detected_lines))

    line_range = min(line_numbers), max(line_numbers) + 1
    if line_numbers != tuple(range(line_range[0], line_range[1])):
        raise RuntimeError("line numbers are not contiguous")

    return line_range, name, "\n".join(lines)
Example #13
    def test_pagination(self):

        index_to_page_size = {
            (page_size * num_pages + num_excess_doc, page_size)
            for page_size in (1, 2, 5) for num_pages in (0, 1, 2, 3)
            for num_excess_doc in (-1, 0, 1)
            if page_size * num_pages + num_excess_doc > 0
        }

        page_sizes_by_index_size = {
            i: list(unzip(page_sizes)[1])
            for i, page_sizes in groupby(sorted(index_to_page_size),
                                         key=itemgetter(0))
        }

        index_size_ = 0
        for index_size, page_sizes in page_sizes_by_index_size.items():
            self._add_docs(index_size - index_size_)
            for page_size in page_sizes:
                for sort_field, sort_path, sort_unique in [
                    ('entryId', ['entryId'], True),
                    ('fileId', ['files', 0, 'uuid'], True),
                    ('fileName', ['files', 0, 'name'], False)
                ]:
                    for reverse in False, True:
                        kwargs = dict(index_size=index_size,
                                      page_size=page_size,
                                      sort_field=sort_field,
                                      reverse=reverse)
                        with self.subTest(**kwargs):
                            self._test_pagination(**kwargs,
                                                  sort_path=sort_path,
                                                  sort_unique=sort_unique)
            index_size_ = index_size
Example #14
 def fuse_optimal(self,
                  cutoff=None,
                  scores=None,
                  fuse_func=None,
                  qrels=None,
                  ignore_zero=True,
                  return_vnos=False,
                  show_progress=True):
     qno_map = {}
     qnos, runqnos = unzip(self.qno_map.items())
     qnos, runqnos = list(qnos), list(runqnos)
     subqrels = [qrels.select_by_qno(x) for x in qnos]
     subscore = [{v: scores[v] for v in x.vnos()} for x in runqnos]
     with Pool(os.cpu_count() // 2) as pool:
         result = pool.imap_unordered(fuse_optimal_sp,
                                      zip(
                                          runqnos,
                                          repeat(cutoff),
                                          subscore,
                                          repeat(fuse_func),
                                          subqrels,
                                          repeat(ignore_zero),
                                          repeat(return_vnos),
                                      ),
                                      chunksize=32)
         if show_progress:
             result = tqdm(result, desc='Opt', total=len(qnos))
         if return_vnos:
             result = {x[0].qno: x for x in result}
             qno_map = {x: result[x][0] for x in qnos}
             qno_vnos = {x: result[x][1] for x in qnos}
             return TrecRun(qno_map), qno_vnos
         else:
             result = {x.qno: x for x in result}
             return TrecRun(result)
Example #15
async def dar_fetch_chunked(uuids, addrtype, chunk_size, client=None):
    """Lookup uuids in DAR (chunked).

    Args:
        uuids: List of DAR UUIDs.
        addrtype: The address type to look up.
        chunk_size: Number of UUIDs per block, sent to DAR.
        client (optional): aiohttp.ClientSession to use for connecting.

    Returns:
        (dict, set):
            dict: Map from UUID to DAR reply.
            set: Set of UUIDs of entries which were not found.
    """
    def create_task(uuid_chunk):
        return asyncio.ensure_future(
            dar_fetch_non_chunked(uuid_chunk, addrtype=addrtype,
                                  client=client))

    # Chunk our UUIDs into blocks of chunk_size
    uuid_chunks = chunked(uuids, chunk_size)
    # Convert chunks into a list of asyncio.tasks
    tasks = list(map(create_task, uuid_chunks))
    # Here 'result' is a list of tuples (dict, set) => (result, missing)
    result = await asyncio.gather(*tasks)
    # First we unzip 'result' to get a list of results and a list of missing
    result_dicts, missing_sets = unzip(result)
    # Then we union the dicts and sets before returning
    combined_result = dict(ChainMap(*result_dicts))
    combined_missing = set.union(*missing_sets)
    return combined_result, combined_missing
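A minimal calling sketch, assuming dar_fetch_non_chunked is available from the same module; the UUID, address type, and chunk size below are placeholders:
import asyncio

async def demo():
    uuids = ["00000000-0000-0000-0000-000000000000"]  # placeholder UUID
    found, missing = await dar_fetch_chunked(uuids, addrtype="adresser",
                                             chunk_size=100)
    print(f"{len(found)} found, {len(missing)} missing")

# Would perform real lookups against DAR:
# asyncio.run(demo())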
Example #16
def validate(comparator: Comparator,
             data_dir: Path,
             validation_csv: Path,
             num_sample: int = 10 ** 3,
             pairs: Optional[Iterable[Tuple[int, int]]] = None,
             labels: Optional[Sequence[int]] = None
             ) -> Tuple[np.ndarray, np.ndarray]:
    if num_sample > 0:
        df = pd.read_csv(validation_csv)
        subject_dict = aggregate_subjects(df['TEMPLATE_ID'], df['SUBJECT_ID'])
        sampled_pairs, sampled_labels = unzip(sample_pairs(subject_dict, num_sample))
        sampled_labels = np.array(list(sampled_labels))
    else:
        sampled_pairs = pairs
        sampled_labels = np.array(labels)
    predictions = np.array(list(unzip(compare_all(data_dir, sampled_pairs, comparator))[2]))
    return sampled_labels, predictions
Example #17
 def run(self, ds: DataStream) -> DataStream:
     docs_ds = self.get_docs_stream(ds)
     processed_docs = map(self.process_doc, docs_ds, docs_ds.context)
     processed_docs = (x for x in processed_docs if x is not None)
     items, context = more_itertools.unzip(processed_docs)
     return DataStream(items=items,
                       applied_ops=ds.applied_ops + [self],
                       context=context)
Example #18
    def run(self, ds: DataStream) -> DataStream:
        flat = itertools.chain.from_iterable(map(self._flatten, ds,
                                                 ds.context))

        items, context = more_itertools.unzip(flat)
        return DataStream(items=items,
                          applied_ops=ds.applied_ops + [self],
                          context=context)
Example #19
def make_pairs_with_lcs(structures_metadata, workers):
    # Group structures by isoform, take the apo/holo product within each group,
    # and find the LCS for every pair (possibly in multiple processes).
    # Open question: write the LCS to output only when there are no mismatches,
    # or always?

    groups = structures_metadata.groupby('uniprotkb_id')  # or by isoform (could be made configurable via args)
    # Structures may be given as files or as codes; that could be handled in
    # `chains_for_uniprot_ids`, or with a join in `filter_structures`.
    def get_pairs():
        for uniprot_id, group_indices in groups.indices.items():
            for pair in get_pairs_in_group(structures_metadata, group_indices, uniprot_id):
                yield uniprot_id, pair

    uniprot_ids, pairs = more_itertools.unzip(get_pairs())
    pair_ids, lcs__args = more_itertools.unzip(pairs)

    i = 0
    print(datetime.now())

    # for uniprot_id, (apo_chain, holo_chain), lcs_future in zip(uniprot_ids, pair_ids, lcs_futures):
    for uniprot_id, (apo_chain, holo_chain), args in zip(uniprot_ids, pair_ids, lcs__args):
        lcs_future = FutureLike(process_execute(get_longest_common_polypeptide, *args))
        i += 1
        if i % 100 == 0:
            maybe_print(False, f'\r{i}', end='')

        try:
            logger.info(f'getting result of {apo_chain} {holo_chain}, from {uniprot_id}')

            yield {
                'pdb_code_apo': apo_chain[0],  # todo could rename the variables to more general chain1 /c1, c2...
                'chain_id_apo': apo_chain[1],
                'pdb_code_holo': holo_chain[0],
                'chain_id_holo': holo_chain[1],
                'lcs_result': lcs_future.result(),
            }

        except Exception as e:
            logger.exception('compute_lcs failed with: ')

    print(datetime.now())
Example #20
def compare(data_path: Path,
            experiment: str,
            num_sample: int,
            use_flip: bool = False) -> None:
    val_csv = Path('data') / 'wide_val.csv'
    model_path = Path('experiments') / experiment / 'snapshots'
    num_weights = len(list(model_path.iterdir())) - 1
    results = []
    df = load_info(data_path, val_csv)
    exists = [
        idx for idx, cur_path in enumerate(df['img_path'])
        if cur_path.exists()
    ]
    val_data = df.iloc[np.array(exists)]
    subject_dict = aggregate_subjects(df['TEMPLATE_ID'], df['SUBJECT_ID'])
    sampled_pairs, sampled_labels = unzip(
        sample_pairs(subject_dict, num_sample))
    sampled_labels = np.array(list(sampled_labels))
    sampled_pairs = list(sampled_pairs)
    names = []
    rank_results = []
    for cur_epoch in range(7, num_weights):
        comparator = CompareModel(str(model_path / experiment),
                                  cur_epoch + 1,
                                  use_flip=use_flip,
                                  ctx=mx.gpu(0))
        comparator.metric = cosine
        rank_comparator = config_rank_comparator(comparator,
                                                 val_data['img_path'])
        cosine_res = validate(comparator,
                              data_path,
                              val_csv,
                              num_sample=0,
                              pairs=sampled_pairs,
                              labels=sampled_labels)
        rank_results.append(
            validate(rank_comparator,
                     data_path,
                     val_csv,
                     num_sample=0,
                     pairs=sampled_pairs,
                     labels=sampled_labels)[1])
        results.append(cosine_res)
        names.append(f'epoch {cur_epoch + 1:04d}')
        results.append((sampled_labels, rank_results[-1]))
        names.append(f'epoch {cur_epoch + 1:04d} rank')
    rank_merge_results = np.mean(rank_results, axis=0)
    results.append((sampled_labels, rank_merge_results))
    names.append('merged')

    plot_roc(
        results,
        experiment_names=names,
        save_name=f'{experiment}_{"flip" if use_flip else "no_flip"}_roc.png')
Example #21
 def check_value(variable, new_value, o):
     """Recurse through object to ensure correct value "new_value" in "variable"."""
     seeded_check_value = partial(check_value, variable, new_value)
     if isinstance(o, dict):
         if variable in o:
             if o[variable] == new_value:
                 return o, False
             o[variable] = new_value
             o["virkning"] = virkning
             return o, True
         keys, values = unzip(o.items())
         values, changed = unzip(map(seeded_check_value, values))
         return dict(zip(keys, values)), any(changed)
     elif isinstance(o, list):
         values, changed = unzip(map(seeded_check_value, o))
         return list(values), any(changed)
     elif isinstance(o, tuple):
         values, changed = unzip(map(seeded_check_value, o))
         return tuple(values), any(changed)
     else:
         return o, False
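A small illustration of the recursion above, assuming check_value, unzip, partial, and a virkning value are available in scope (all values below are made up):
virkning = {"from": "2020-01-01", "to": "infinity"}  # placeholder validity period

obj = {"attributes": [{"gyldighed": "Aktiv"}, {"unrelated": 1}]}
new_obj, changed = check_value("gyldighed", "Inaktiv", obj)
print(changed)                   # True
print(new_obj["attributes"][0])  # {'gyldighed': 'Inaktiv', 'virkning': {...}}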
Example #22
    def collate_fn(inputs):
        (input_ids_tuple, token_type_ids, attention_mask, positions, widths,
         options, targets) = map(list, unzip(inputs))

        input_ids = pad_sequence([item[0] for item in input_ids_tuple],
                                 batch_first=True,
                                 padding_value=0)
        input_masked_ids = pad_sequence([item[1] for item in input_ids_tuple],
                                        batch_first=True,
                                        padding_value=0)
        token_type_ids = pad_sequence(token_type_ids,
                                      batch_first=True,
                                      padding_value=0)
        attn_masks_literal = pad_sequence([item[0] for item in attention_mask],
                                          batch_first=True,
                                          padding_value=0)
        attn_masks_idiomatic = pad_sequence(
            [item[1] for item in attention_mask],
            batch_first=True,
            padding_value=0)

        width_max = max(widths)
        gather_index = torch.arange(0, width_max,
                                    dtype=torch.long).unsqueeze(0).repeat(
                                        len(inputs), 1).clone()
        for i, (p, w) in enumerate(zip(positions, widths)):
            gather_index.data[i, :w] = torch.arange(p[0],
                                                    p[0] + w,
                                                    dtype=torch.long).data

        gather_index_masked = torch.arange(
            0, width_max,
            dtype=torch.long).unsqueeze(0).repeat(len(inputs), 1).clone()
        for i, (p, w) in enumerate(zip(positions, widths)):
            gather_index_masked.data[i, :w] = torch.arange(
                p[1], p[1] + w, dtype=torch.long).data

        batch = {
            'input_ids':
            torch.stack([input_ids, input_masked_ids]),
            'token_type_ids':
            torch.stack([token_type_ids, token_type_ids]),
            'attention_mask':
            torch.stack([attn_masks_literal, attn_masks_idiomatic]),
            'gather_index': (gather_index, gather_index_masked),
            'positions':
            torch.tensor(positions).long(),
            'option_ids':
            torch.tensor(options).long(),
            'targets':
            torch.tensor(targets).long()
        }
        return batch
Example #23
 def current(self):
     """
      Obtain the reward function that currently maximizes the
      objective (value-function objective + sparsity objective).
     """
     pairs = list(zip(self.coeffs, self.rewardBases))
     fn = lambda s: sum([c * rfn(s) for c, rfn in pairs])
     ranges = [rfn.reward_range for rfn in self.rewardBases]
     mins, maxs = list(map(list, unzip(ranges)))
     rMin = min(c * m for c, m in zip(self.coeffs, mins))
     rMax = max(c * M for c, M in zip(self.coeffs, maxs))
     return Reward(fn, (rMin, rMax))
Example #24
def take_unchanged(mating_group: t.List[t.Tuple[GraphIndividual, Record]],
                   brood_size: int) -> t.List[GraphIndividual]:
    """
    Randomly takes `brood_size` number of individuals from the mating groups.
    :param mating_group: A group of individuals selected to give progeny.
    :param brood_size: A number of offsprings.
    :return: List of offsprings -- copies of the parents.
    """
    individuals, _ = unzip(mating_group)
    return list(
        take(brood_size,
             (ind.copy() for ind in random_permutation(individuals))))
Example #25
def main(input, part):
    # Iterator of lines
    lines = map(lambda x: x.strip(), input.readlines())
    # Iterator of integers
    integers = list(map(int, lines))
    # integers = [28, 33, 18, 42, 31, 14, 46, 20, 48, 47, 24, 23, 49, 45, 19, 38, 39, 11, 1, 32, 25, 35, 8, 17, 7, 9, 4, 2, 34, 10, 3]
    # integers = [16, 10, 15, 5, 1, 11, 7, 19, 6, 12, 4]
    integers = sorted(integers)

    min_jolts = 0
    max_jolts = integers[-1] + 3
    diff_integers = chain([min_jolts], integers, [max_jolts])
    differences = list(map(lambda a, b: b - a,
                           *unzip(pairwise(diff_integers))))

    if part == "1":
        differences = Counter(differences)
        print(differences[1] * differences[3])
    if part == "2":
        # We do not have to consider all possible combinations, as all the
        # combinations will contain certain sequences, namely all sequences
        # will pass through 3-difference numbers, and thus the sequences on
        # either side of these 3-difference numbers are independent.
        #
        # Thus we can split the problem into several subproblems, one on either
        # side of the 3-difference numbers.
        @apply
        def difference_is_3(index, value):
            """Check if the current index is a 3-difference number."""
            return differences[index] == 3

        # Iterator of subproblems (lists separated by 3-difference numbers)
        # Each element in the lists are (index, value)
        subproblems = split_after(enumerate(integers), difference_is_3)
        # Map each element in the lists to just value (drop index)
        subproblems = map(lambda element: list(unzip(element)[1]), subproblems)
        # Find number of possible combinations for each block
        sub_counts = map(find_arrangements, subproblems)
        # Multiply the values for all the blocks to get a total
        print(prod(sub_counts))
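The differences line above works because unzip(pairwise(...)) yields the left and right elements of each consecutive pair as two separate iterators; a standalone sketch with made-up numbers:
from more_itertools import pairwise, unzip

jolts = [0, 1, 4, 5, 6, 7, 10]                     # made-up adapter chain
lefts, rights = unzip(pairwise(jolts))
differences = list(map(lambda a, b: b - a, lefts, rights))
print(differences)                                 # [1, 3, 1, 1, 1, 3]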
Example #26
def get_predicts(data_path, cache_dir, img_paths, pairs):
    img_matcher = config_resnet_matcher(img_paths, cache_dir)
    detector = pipeline_detector(img_paths, cache_dir / 'detector', small_face=16)
    experiment_names = ['ultimate5', 'test_center_vgg']
    epochs = [20, 10]
    experiment_path = Path('experiments')
    feature_extractors = [mxnet_feature_extractor(
        cache_dir / f'extractor_{cur_exp}_{cur_epoch:04d}',
        str(experiment_path / cur_exp / 'snapshots' / cur_exp),
        cur_epoch, use_flip=True, ctx=mx.gpu(0))
        for cur_exp, cur_epoch in zip(experiment_names, epochs)]
    comparator = PipeMatcher(img_paths, cache_dir, img_matcher, detector, feature_extractors)
    return np.array(list(unzip(compare_all(data_path, pairs, comparator))[2]))
Example #27
def validate_pipe():
    cache_dir = Path('/run/media/andrey/Data/pipe_cache')
    data_path = Path('/run/media/andrey/Fast/FairFace/data/train/data')
    val_csv = Path('data') / 'val_df.csv'
    val_data = load_info(data_path, val_csv)
    num_sample = 1 * 10 ** 5
    subject_dict = aggregate_subjects(val_data['TEMPLATE_ID'], val_data['SUBJECT_ID'])
    sampled_pairs, sampled_labels = unzip(sample_pairs(subject_dict, num_sample))
    sampled_labels = np.array(list(sampled_labels))
    sampled_pairs = list(sampled_pairs)
    predictions = get_predicts(data_path, cache_dir, val_data['img_path'], sampled_pairs)
    plot_roc([(sampled_labels, predictions)], ['composite'],
             save_name='test_pipe.png')
Example #28
 def run(self, ds: stream.DataStream) -> stream.DataStream:
     docs_ds = self.get_docs_stream(ds)
     docs = zip(docs_ds, docs_ds.context)
      # each match result is a tuple: ((doc, matches), context)
     match_results = self.matcher.pipe(docs,
                                       return_matches=True,
                                       as_tuples=True)
     new_docs_with_context = more_itertools.map_except(
         self._filter_tokens, match_results, EmptyTextError)
     new_docs, context = more_itertools.unzip(new_docs_with_context)
     return stream.DataStream(new_docs,
                              applied_ops=ds.applied_ops + [self],
                              context=context)
Example #29
    def _get_batch(self, bs: int, x, y=None):
        y_x_pairs = zip(y, x) if y is not None else enumerate(x)
        for batch in cytoolz.partition_all(bs, y_x_pairs):
            batch_y, batch_x = more_itertools.unzip(batch)
            X, Y = list(batch_x), list(batch_y)

            if sparse.issparse(Y[0]):
                Y = sparse.vstack(Y)
            elif isinstance(Y[0], np.ndarray):
                Y = np.vstack(Y)
            if sparse.issparse(X[0]):
                X = sparse.vstack(X)
            elif isinstance(X[0], np.ndarray):
                X = np.vstack(X)
            yield X, Y
Example #30
def splot_multiple(
    *exprs: CurveOrRange,
    plotf: splot, random_colors=False,
    **options):
    "plots multiple curves with extra kwargs for each one"
    show = options.pop("show", True)
    curves, curves_args = unzip(curves_iter(exprs))
    #pprint(curves)
    plot = plotf(*curves, show=False, **options)
    for i, curve_args in enumerate(curves_args):
        if random_colors:
            plot[i].line_color = [random_bright_rgb_color()]
        for key, value in curve_args.items():
            setattr(plot[i], key, value)
    if show:
        plot.show()
    return plot
Example #31
    def scrape_candidates(self, product_name, archive_directory, major_version, stdout):
        """Scrape the candidates/ directory for beta, release candidate, and final releases."""
        url_path = '/pub/%s/candidates/' % archive_directory
        stdout.write('scrape_candidates working on %s' % url_path)

        # First, let's look at /pub/PRODUCT/releases/ so we know what final
        # builds have been released
        release_path = '/pub/%s/releases/' % archive_directory
        release_path_content = self.download(release_path)

        # Get the final release version numbers, e.g. "64.0b8/" -> "64.0b8"
        final_releases = [
            link['text'].rstrip('/') for link in self.get_links(release_path_content)
            if link['text'][0].isdigit()
        ]

        content = self.download(url_path)
        version_links = [
            link for link in self.get_links(content)
            if link['text'][0].isdigit()
        ]

        # If we've got a major_version, then we only want to scrape data for versions
        # greater than (major_version - 4) and esr builds
        if major_version:
            major_version_minus_4 = major_version - 4
            stdout.write(
                'skipping anything before %s and not esr (%s)' %
                (product_name, major_version_minus_4)
            )
            version_links = [
                link for link in version_links
                if (
                    # "63.0b7-candidates/" -> 63
                    int(link['text'].split('.')[0]) >= major_version_minus_4 or
                    'esr' in link['text']
                )
            ]

        scrape = partial(
            self.scrape_candidate_version,
            product_name=product_name,
            final_releases=final_releases
        )

        if self.num_workers == 1:
            results = map(scrape, version_links)

        else:
            with concurrent.futures.ProcessPoolExecutor(max_workers=self.num_workers) as executor:
                results = executor.map(scrape, version_links, timeout=300)

        results = list(results)
        # Convert [(build_data, msgs), (build_data, msgs), ...] into
        # build_data and msgs
        if results:
            build_data, msgs = more_itertools.unzip(results)
        else:
            build_data, msgs = [], []

        # Print all the msgs to stdout
        for msg_group in msgs:
            for msg in msg_group:
                stdout.write('worker: %s' % msg)

        # build_data is a list of lists so we flatten that
        return list(more_itertools.flatten(build_data))