Example #1
def save_annotation(data):
    user_id = data['user_id']
    annotations = data['annotations']

    sample_to_platform = dict(
        Sample.objects.filter(id__in=annotations).values_list(
            'id', 'platform_id'))
    groups = group_by(lambda pair: sample_to_platform[pair[0]],
                      annotations.items())

    for platform_id, annotations in groups.items():
        canonical = SeriesAnnotation.objects.create(
            platform_id=platform_id,
            annotations=1,
            authors=1,
            **project(data, ['series_id', 'tag_id', 'column', 'regex']))
        canonical.fill_samples(annotations)

        raw_annotation = RawSeriesAnnotation.objects.create(
            canonical=canonical,
            platform_id=platform_id,
            created_by_id=user_id,
            **project(
                data,
                ['series_id', 'tag_id', 'column', 'regex', 'note', 'from_api'
                 ]))
        raw_annotation.fill_samples(annotations)

        ValidationJob.objects.create(annotation=canonical)
Example #2
 def extract_map(name_map, names):
     lookup_root = fn.merge(*({v: k
                               for v in vals} for k, vals in name_map))
     mapping = fn.group_by(lambda x: lookup_root[x.split('##time_')[0]],
                           names)
     mapping = fn.walk_values(tuple, mapping)  # Make hashable.
     return frozenset(mapping.items())
Example #3
def _push(
    repo,
    git_remote: str,
    refs: Iterable["ExpRefInfo"],
    force: bool,
) -> Mapping[SyncStatus, List["ExpRefInfo"]]:
    from scmrepo.exceptions import AuthError

    from ...scm import GitAuthError

    refspec_list = [f"{exp_ref}:{exp_ref}" for exp_ref in refs]
    logger.debug(f"git push experiment '{refs}' -> '{git_remote}'")

    with TqdmGit(desc="Pushing git refs") as pbar:
        try:
            results: Mapping[str, SyncStatus] = repo.scm.push_refspecs(
                git_remote,
                refspec_list,
                force=force,
                progress=pbar.update_git,
            )
        except AuthError as exc:
            raise GitAuthError(str(exc))

    def group_result(refspec):
        return results[str(refspec)]

    push_result: Mapping[SyncStatus, List["ExpRefInfo"]] = group_by(
        group_result, refs
    )

    return push_result
Example #4
def buildDict(prefix, urlx):
    rd = jutils.getHttpCsv(urlx)
    rdict = {}
    wlist = []
    for row in rd:
        cols = row.split(',')
        if len(cols) > 5:
            wlist.append(cols[5])
    w2list = [i for i in wlist if len(i) >= CHAR_MIN and len(i) <= CHAR_MAX]
    #print jj(w2list[:50])
    #pr = partListToDict(w2list,keyFuncFirstChar)
    pr = funcy.group_by(lambda x: x[0], w2list)
    c = Counter([i[0] for i in w2list]).most_common(MOST_LIMIT)
    # print jj(c[:24])
    # print jj(pr[c[0][0]])
    fmck = [i[0] for i in c]
    fmcv = [i[1] for i in c]
    # 256*8=2048
    # 512*4=2048
    wordlist = funcy.flatten([pr[x][:4] for x in fmck])
    rdict['data'] = funcy.select_keys(lambda x: x in fmck, pr)
    rdict['meta'] = {
        'source': urlx,
        'wordlist': wordlist,
        'firstMostCommonKey': fmck,
        'firstMostCommonCount': fmcv,
        'host': 'http://data.gov.tw',
        'build': 'http://console.tw',
        'script':
        'https://github.com/y12studio/console-tw/tree/master/projects/datachart/',
        'prefix': prefix,
        'time': datetime.datetime.utcnow().isoformat()
    }
    return rdict
Example #5
def vim_leave_pre():
    ''' Remove all pyunite states in each tabpage '''
    with restore(vim.current.window), restore(vim.current.tabpage):
        states_by_tab = fn.group_by(itemgetter('tabpage_from'), variables.states)
        # error(str(dict(states_by_tab)))
        for tabpage, states in states_by_tab.items():
            change_tabpage(tabpage)
            map(remove_state, states)
Example #6
    def _normalize_options(self, query, options):
        options = DatatableOptions._normalize_options(self, query, options)

        filters = group_by(r'^(GSE|GPL|)', options['search'].split())
        options['search'] = ''.join(filters.pop('', []))
        options['filters'] = filters

        return options
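A minimal sketch (not from any of the projects above) of how funcy's group_by accepts a regex string as the key: each token is keyed by its first captured group, which is why the code above can pop the '' bucket for the plain search words. The tokens are illustrative.

from funcy import group_by

tokens = 'GSE12345 GPL570 liver'.split()
groups = group_by(r'^(GSE|GPL|)', tokens)
# The empty alternative in the pattern matches every token, so plain words
# land under the '' key, mirroring filters.pop('', []) above.
# groups (a defaultdict) -> {'GSE': ['GSE12345'], 'GPL': ['GPL570'], '': ['liver']}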
Example #7
def group_by_attr(attr: str, it: Iterable) -> dict:
    """

    :param attr: attribute expected to be on any item in it
    :param it: any iterable
    :return: dict -> where any item x in it, will be in a bucket key of which is x.attr
    """
    return dict(group_by(operator.attrgetter(attr), it))
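An illustrative usage of group_by_attr above; the Point namedtuple is made up for the sketch.

import operator
from collections import namedtuple
from funcy import group_by

Point = namedtuple('Point', 'color x')
points = [Point('red', 1), Point('blue', 2), Point('red', 3)]
dict(group_by(operator.attrgetter('color'), points))  # same result as group_by_attr('color', points)
# -> {'red': [Point(color='red', x=1), Point(color='red', x=3)],
#     'blue': [Point(color='blue', x=2)]}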
Example #8
def load_data(header):
    print(colored('Found %d data lines' % len(header), 'cyan'))
    line_groups = group_by(r'^!([^_]+)_', header)

    # Load series
    series_df = get_df_from_lines(line_groups['Series'], 'Series')
    assert len(series_df.index) == 1
    missing = REQUIRED_SERIES_FIELDS - set(series_df.columns)
    if missing:
        cprint(
            'Skip incomplete header: %s column%s missing' %
            (', '.join(sorted(missing)), 's' if len(missing) > 1 else ''),
            'red')
        return

    gse_name = series_df['series_geo_accession'][0]

    # Skip multispecies
    if '|\n|' in series_df['series_platform_taxid'][0]:
        cprint('Skip multispecies', 'red')
        return
    if series_df['series_platform_taxid'][0] != series_df[
            'series_sample_taxid'][0]:
        cprint('Skip sample-platform species mismatch', 'red')
        return

    # Check if series updated
    try:
        old_last_update = Series.objects.get(
            gse_name=gse_name).attrs.get('last_update_date')
    except Series.DoesNotExist:
        old_last_update = None
    new_last_update = series_df['series_last_update_date'][0]
    if new_last_update == old_last_update:
        print(
            colored('%s not changed since %s' % (gse_name, old_last_update),
                    'yellow'))
        return
    else:
        print(
            colored(
                '%s updated %s -> %s' %
                (gse_name, old_last_update, new_last_update), 'green'))

    # Load samples
    try:
        samples_df = get_df_from_lines(line_groups['Sample'], 'Sample')
    except pd.errors.ParserError as e:
        cprint('Failed to parse sample lines: %s' % e, 'red')
        return
    samples_df['gsm_name'] = samples_df.sample_geo_accession
    samples_df = samples_df.set_index('gsm_name')

    insert_or_update_data(series_df, samples_df)
Example #9
def iterable_per_line(triples):
    """Yield iterables of (key, value mapping), one for each line."""
    # Jam all the triples of a file into a hash by line number:
    line_map = group_by(lambda (k, v, extent): extent.start.row, triples)  # {line: triples}
    last_line = max(line_map.iterkeys()) + 1 if line_map else 1

    # Pull out the needles for each line, stripping off the extents and
    # producing a blank list for missing lines. (The defaultdict returned from
    # group_by takes care of the latter.)
    return [[(k, v) for (k, v, e) in line_map[line_num]]
            for line_num in xrange(1, last_line)]
Example #10
File: scm.py Project: nik123/dvc
def iter_revs(
    scm: "Git",
    head_revs: Optional[List[str]] = None,
    num: int = 1,
    all_branches: bool = False,
    all_tags: bool = False,
    all_commits: bool = False,
    all_experiments: bool = False,
) -> Mapping[str, List[str]]:
    from dvc.repo.experiments.utils import fix_exp_head

    if num < 1 and num != -1:
        raise InvalidArgumentError(f"Invalid number of commits '{num}'")

    if not any(
        [head_revs, all_branches, all_tags, all_commits, all_experiments]
    ):
        return {}

    head_revs = head_revs or []
    revs = []
    for rev in head_revs:
        revs.append(rev)
        n = 1
        while True:
            if num == n:
                break
            try:
                head = fix_exp_head(scm, f"{rev}~{n}")
                assert head
                revs.append(resolve_rev(scm, head))
            except RevError:
                break
            n += 1

    if all_commits:
        revs.extend(scm.list_all_commits())
    else:
        if all_branches:
            revs.extend(scm.list_branches())

        if all_tags:
            revs.extend(scm.list_tags())

    if all_experiments:
        from dvc.repo.experiments.utils import exp_commits

        revs.extend(exp_commits(scm))

    rev_resolver = partial(resolve_rev, scm)
    return group_by(rev_resolver, revs)
Example #11
def sha1_copies(files):
    """ Find files identical sha1.

    Args:
        files(Iterable[PurePath]): Iterable of files.

    Yields:
        list: lists of files identical by sha1
    """
    file_hashes = group_by(get_file_sha1, files)

    for sha1, copy_files in file_hashes.items():
        if len(copy_files) > 1:
            yield copy_files
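A hedged sketch of driving sha1_copies above, assuming get_file_sha1 hashes file contents; the directory name is illustrative.

from pathlib import Path

files = (p for p in Path('some_dir').rglob('*') if p.is_file())
for duplicates in sha1_copies(files):
    # each duplicates list holds paths whose contents hash to the same sha1
    print([str(p) for p in duplicates])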
Example #12
 def latest_instances(self):
     """Group instances by their component and and return latest within each group."""
     component_lookup = {component.object_id: component for component in self.objects("COMPONENT")}
     grouped_instances = funcy.group_by(attrgetter("component_parent"), self.objects("INSTANCE"))
     logger.info(
         "Pipeline '%s' has %d components and %d instances, looking for latest instances",
         self.pipeline_id,
         len(grouped_instances),
         sum(map(len, grouped_instances.values())),
     )
     for component_parent in sorted(grouped_instances):
         latest_instance = sorted(grouped_instances[component_parent], key=attrgetter("scheduled_start_time"))[-1]
         latest_instance.parent_object = component_lookup[component_parent]
         yield latest_instance
Example #13
    def query_dnf(query):
        def table_for(alias):
            if alias == main_alias:
                return alias
            return query.alias_map[alias].table_name

        dnf = _dnf(query.where)

        # NOTE: we exclude content_type as it never changes and will hold dead invalidation info
        main_alias = query.model._meta.db_table
        aliases = {alias for alias, join in query.alias_map.items()
                   if query.alias_refcount[alias] and table_tracked(join.table_name)} \
                | {main_alias} - {'django_content_type'}
        tables = group_by(table_for, aliases)
        return {table: clean_dnf(dnf, table_aliases) for table, table_aliases in tables.items()}
Example #14
    def query_dnf(query):
        def table_for(alias):
            if alias == main_alias:
                return alias
            return query.alias_map[alias].table_name

        dnf = _dnf(query.where)

        # NOTE: we exclude content_type as it never changes and will hold dead invalidation info
        main_alias = query.model._meta.db_table
        aliases = {alias for alias, (join, cnt) in zip_dicts(query.alias_map, query.alias_refcount)
                   if cnt and family_has_profile(table_to_model(join.table_name))} \
                | {main_alias} - {'django_content_type'}
        tables = group_by(table_for, aliases)
        return {table: clean_dnf(dnf, table_aliases) for table, table_aliases in tables.items()}
Example #15
def bounding_box(r: Rec, oracle):
    """Compute Bounding box. TODO: clean up"""
    recs = list(box_edges(r))

    tops = [(binsearch(r2, oracle)[1].top,
             tuple((np.array(r2.top) - np.array(r2.bot) != 0))) for r2 in recs]
    tops = fn.group_by(ig(1), tops)

    def _top_components():
        for key, vals in tops.items():
            idx = key.index(True)
            yield max(v[0][idx] for v in vals)

    top = np.array(list(_top_components()))
    intervals = tuple(zip(r.bot, top))
    return to_rec(intervals=intervals)
Example #16
def group_needles(line_needles):
    """Group line needles by line, and return a list of needles for each line,
    up to the last line with any needles::

        [(a, 1), (b, 4), (c, 4)] -> [[a], [], [], [b, c]]

    """
    # Jam all the needles of a file into a hash by line number:
    line_map = group_by(itemgetter(1), line_needles)  # {line: needles}
    last_line = max(line_map.iterkeys()) + 1 if line_map else 1

    # Pull out the needles for each line, stripping off the line number
    # elements of the tuples and producing a blank list for missing lines.
    # (The defaultdict returned from group_by takes care of the latter.)
    return [[pair for (pair, _) in line_map[line_num]]
            for line_num in xrange(1, last_line)]
Example #17
def group_needles(line_needles):
    """Group line needles by line, and return a list of needles for each line,
    up to the last line with any needles::

        [(a, 1), (b, 4), (c, 4)] -> [[a], [], [], [b, c]]

    """
    # Jam all the needles of a file into a hash by line number:
    line_map = group_by(itemgetter(1), line_needles)  # {line: needles}
    last_line = max(line_map.iterkeys()) + 1 if line_map else 1

    # Pull out the needles for each line, stripping off the line number
    # elements of the tuples and producing a blank list for missing lines.
    # (The defaultdict returned from group_by takes care of the latter.)
    return [[pair for (pair, _) in line_map[line_num]]
            for line_num in xrange(1, last_line)]
Example #18
File: scm.py Project: jhhuh/dvc
def iter_revs(
    scm: "Git",
    revs: Optional[List[str]] = None,
    num: int = 1,
    all_branches: bool = False,
    all_tags: bool = False,
    all_commits: bool = False,
    all_experiments: bool = False,
) -> Mapping[str, List[str]]:

    if not any([revs, all_branches, all_tags, all_commits, all_experiments]):
        return {}

    revs = revs or []
    results = []
    for rev in revs:
        if num == 0:
            continue
        results.append(rev)
        n = 1
        while True:
            if num == n:
                break
            try:
                head = f"{rev}~{n}"
                results.append(resolve_rev(scm, head))
            except RevError:
                break
            n += 1

    if all_commits:
        results.extend(scm.list_all_commits())
    else:
        if all_branches:
            results.extend(scm.list_branches())

        if all_tags:
            results.extend(scm.list_tags())

    if all_experiments:
        from dvc.repo.experiments.utils import exp_commits

        results.extend(exp_commits(scm))

    rev_resolver = partial(resolve_rev, scm)
    return group_by(rev_resolver, results)
Example #19
def get_mbta_station_info(cfg):
    route_info = query_mbta_id("routes", cfg['route'])
    stop_info = query_mbta_id("stops", cfg['stop'])

    params = (('filter[stop]', cfg['stop']), ('filter[route]', cfg['route']),
              ('page[limit]', '10'))
    arrivals = query_mbta('predictions', params)
    by_direction = f.walk_values(
        vectorize(f.compose(relative_ts, op.itemgetter('arrival_time'))),
        f.group_by(op.itemgetter('direction_id'), arrivals))

    return [
        f.merge(
            {
                "station": stop_info['name'],
                "route": cfg['route'],
                "direction": route_info['direction_destinations'][k],
            }, dict(zip(range(5), pad(v, 5))))
        for k, v in by_direction.items()
    ]
Example #20
    def _normalize_options(self, query, options):
        """
        Here we parse some search tokens differently to enable filtering:
            GSE\d+ and GPL\d+    filter by a specific series or platform
            tag=\w+              filters by tag
            valid                selects validated annotations
        """
        options = DatatableOptions._normalize_options(self, query, options)
        # Try normally named field
        if not options['search']:
            options['search'] = query.get('search', '').strip()

        filters = group_by(r'^(GSE|GPL|[Tt]ag=|valid|novalid)',
                           options['search'].split())
        options['search'] = ' '.join(filters.pop(None, []))

        filters = walk_keys(str.lower, filters)
        filters['tag'] = lmap(r'^[Tt]ag=(.*)', filters.pop('tag=', []))
        options['filters'] = filters

        return options
Example #21
def load_data(header):
    print colored('Found %d data lines' % len(header), 'cyan')
    line_groups = group_by(r'^!([^_]+)_', header)

    # Load series
    series_df = get_df_from_lines(line_groups['Series'], 'Series')
    assert len(series_df.index) == 1
    gse_name = series_df['series_geo_accession'][0]

    # Skip multispecies
    if '|\n|' in series_df['series_platform_taxid'][0]:
        cprint('Skip multispecies', 'red')
        return
    if series_df['series_platform_taxid'][0] != series_df[
            'series_sample_taxid'][0]:
        cprint('Skip sample-platform species mismatch', 'red')
        return

    # Check if series updated
    try:
        old_last_update = Series.objects.get(
            gse_name=gse_name).attrs.get('last_update_date')
    except Series.DoesNotExist:
        old_last_update = None
    new_last_update = series_df['series_last_update_date'][0]
    if new_last_update == old_last_update:
        print colored('%s not changed since %s' % (gse_name, old_last_update),
                      'yellow')
        return
    else:
        print colored(
            '%s updated %s -> %s' %
            (gse_name, old_last_update, new_last_update), 'green')

    # Load samples
    samples_df = get_df_from_lines(line_groups['Sample'], 'Sample')
    samples_df['gsm_name'] = samples_df.sample_geo_accession
    samples_df = samples_df.set_index('gsm_name')

    insert_or_update_data(series_df, samples_df)
Example #22
def draw_graph_animation(graph):
    vertices_names = graph.new_vertex_property('string')
    graph.vertex_properties['vertices_names'] = vertices_names

    for vertex in graph.vertices():
        vertices_names[vertex] = \
            graph.vertex_properties['actors_on_vertices'][vertex] + \
            ' ' + str(graph.vertex_properties['pagerank'][vertex])

    graph.vertex_properties['pos'] = sfdp_layout(
        graph, eweight=graph.edge_properties['weights_on_edges'])

    dir_name = 'pagerank/' + \
        graph.graph_properties['repo_on_graph'].replace('/', '%') + '/'

    os.mkdir(dir_name)

    def event_bulk(vertex):
        event = graph.vertex_properties['events_on_vertices'][vertex]
        return event['created_at'].strftime("%Y-%m-%d %H")

    batch_sizes = map(
        lambda x: len(x[1]),
        sorted(group_by(event_bulk, graph.vertices()).items(),
               key=lambda x: x[0]))

    def tail_number(n):
        if n == 0:
            return batch_sizes[0]
        else:
            return tail_number(n - 1) + batch_sizes[n]

    batch_numbers = map(tail_number, range(len(batch_sizes)))

    map(draw_graph_frame, map(lambda x: (graph, dir_name, x), batch_numbers))

    images = [Image.open(dir_name + str(i) + '.png') for i in batch_numbers]

    writeGif(dir_name + 'animation.gif', images, duration=0.1)
Example #23
def draw_graph_animation(graph):
    vertices_names = graph.new_vertex_property('string')
    graph.vertex_properties['vertices_names'] = vertices_names

    for vertex in graph.vertices():
        vertices_names[vertex] = \
            graph.vertex_properties['actors_on_vertices'][vertex] + \
            ' ' + str(graph.vertex_properties['pagerank'][vertex])

    graph.vertex_properties['pos'] = sfdp_layout(
        graph, eweight=graph.edge_properties['weights_on_edges'])

    dir_name = 'pagerank/' + \
        graph.graph_properties['repo_on_graph'].replace('/', '%') + '/'

    os.mkdir(dir_name)

    def event_bulk(vertex):
        event = graph.vertex_properties['events_on_vertices'][vertex]
        return event['created_at'].strftime("%Y-%m-%d %H")

    batch_sizes = map(lambda x: len(x[1]), sorted(group_by(
        event_bulk, graph.vertices()).items(), key=lambda x: x[0]))

    def tail_number(n):
        if n == 0:
            return batch_sizes[0]
        else:
            return tail_number(n - 1) + batch_sizes[n]

    batch_numbers = map(tail_number, range(len(batch_sizes)))

    map(draw_graph_frame, map(
        lambda x: (graph, dir_name, x), batch_numbers))

    images = [Image.open(dir_name + str(i) + '.png') for i in batch_numbers]

    writeGif(dir_name + 'animation.gif', images, duration=0.1)
Example #24
    def handle(self, **options):
        SeriesAnnotation.objects.filter(annotations__gt=0).update(is_active=True)

        qs = RawSeriesAnnotation.objects.order_by('id')
        by_canonical = group_by(lambda a: a.canonical_id, qs)

        for anno in tqdm(qs):
            if anno.ignored or anno.by_incompetent:
                continue
            last_anno = by_canonical[anno.canonical_id][-1]
            if not samples_match(anno, last_anno):
                anno.is_active = False
                anno.obsolete = True
                anno.save()
            else:
                anno.obsolete = False
                try:
                    anno.is_active = True
                    anno.save()
                except IntegrityError:
                    anno.is_active = False
                    anno.note += '# dup'
                    anno.save()
Example #25
def find_copies(path_to_dir, delete):
    dir_path = Path(path_to_dir)

    dir_iter = recursion_finder(dir_path)
    # group all files by size and filter out 0-sized files
    file_sizes = group_by(get_file_size, dir_iter)
    file_sizes = select_keys(None, file_sizes)
    # get groups of files and filter out one-member groups
    files_groups = filter(not_alone_item, file_sizes.values())

    # view copies grouped by sha1
    for copies in sha1_copies_from_groups(files_groups):

        show_list_files(copies, TEXTS['identical_files'])

        if not delete:
            continue

        # wait for user input
        nums = get_nums_for_delete(copies)

        # if the user chose not to delete the files
        if 0 in nums:
            continue

        files_to_delete = [copies[num - 1] for num in nums]

        # show files to delete
        show_list_files(files_to_delete, TEXTS['delete_list'])

        if click.confirm(TEXTS['confirm']):
            delete_files(files_to_delete)
            click.echo(TEXTS['delete_success'])
        else:
            click.echo(TEXTS['delete_aborted'])

        click.echo('=' * 20)
Example #26
def _pull(
    repo,
    git_remote: str,
    refs: Iterable["ExpRefInfo"],
    force: bool,
) -> Mapping[SyncStatus, List["ExpRefInfo"]]:
    refspec_list = [f"{exp_ref}:{exp_ref}" for exp_ref in refs]
    logger.debug(f"git pull experiment '{git_remote}' -> '{refspec_list}'")

    with TqdmGit(desc="Fetching git refs") as pbar:
        results: Mapping[str, SyncStatus] = repo.scm.fetch_refspecs(
            git_remote,
            refspec_list,
            force=force,
            progress=pbar.update_git,
        )

    def group_result(refspec):
        return results[str(refspec)]

    pull_result: Mapping[SyncStatus,
                         List["ExpRefInfo"]] = group_by(group_result, refs)

    return pull_result
Example #27
def categorize(ast):
    """Group ast nodes based on their type."""
    return group_by(_categorize, ast.walk_down())
Example #28
def calc_validation_stats(serie_validation_pk, recalc=False):
    serie_validation = SerieValidation.objects.select_for_update().get(
        pk=serie_validation_pk)
    # Guard against double updates, so that user stats won't be messed up
    if not recalc and serie_validation.samples_total is not None:
        return
    series_tag = serie_validation.series_tag
    if not series_tag:
        return

    # Compare to annotation
    sample_validations = serie_validation.sample_validations.all()
    sample_annotations = series_tag.sample_tags.all()

    if set(r.sample_id for r in sample_validations) \
            != set(r.sample_id for r in sample_annotations):
        logger.error("Sample sets mismatch for validation %d" %
                     serie_validation_pk)
        # It's either a bug when making the annotation or the sample set really changed
        series_tag.is_active = False
        series_tag.save()
        # TODO: notify annotation author to redo it
        return

    _fill_concordancy(sample_validations, sample_annotations)

    # Fill serie validation stats
    serie_validation.samples_total = len(sample_validations)
    serie_validation.samples_concordant = sum(s.concordant
                                              for s in sample_validations)
    serie_validation.annotation_kappa = _cohens_kappa(sample_validations,
                                                      sample_annotations)

    # Compare to other validations
    earlier_validations = series_tag.validations.filter(pk__lt=serie_validation_pk, ignored=False) \
                                    .order_by('pk')
    # TODO: use .prefetch_related()
    earlier_sample_validations = group_by(
        lambda v: v.serie_validation_id,
        SampleValidation.objects.filter(
            serie_validation__in=earlier_validations))

    if not serie_validation.concordant:
        serie_validation.agrees_with = first(
            v for v in earlier_validations
            if v.created_by_id != serie_validation.created_by_id
            and is_samples_concordant(earlier_sample_validations[v.pk],
                                      sample_validations))

    # NOTE: this includes kappas against your prev validations
    serie_validation.best_kappa = max(
        chain([serie_validation.annotation_kappa],
              (_cohens_kappa(sample_validations, sv)
               for sv in earlier_sample_validations.values())))
    serie_validation.save()

    # Calculate fleiss kappa for all existing annotations/validations
    annotation_sets = [sample_annotations, sample_validations] \
        + earlier_sample_validations.values()
    series_tag.fleiss_kappa = _fleiss_kappa(annotation_sets)
    if not serie_validation.on_demand and not serie_validation.ignored \
            and (serie_validation.concordant or serie_validation.agrees_with):
        series_tag.agreed = earlier_validations.count() + 1
    series_tag.save()

    # TODO: make this separate task ?
    if not recalc and not serie_validation.on_demand and not serie_validation.by_incompetent:
        _update_user_stats(serie_validation)  # including payment ones

    # TODO: make this separate task ?
    # Reschedule validation if no agreement found
    if not series_tag.agreed and not recalc and not serie_validation.on_demand \
            and not serie_validation.by_incompetent:
        # Schedule revalidations with priority < 0, that's what new validations have,
        # to phase out garbage earlier
        _reschedule_validation(serie_validation,
                               priority=series_tag.fleiss_kappa - 1)
Example #29
def group(events):
    return group_by(lambda x: x['repo'], events)
Example #30
def iuniq(func, lst):
    return imap(fn.first, fn.group_by(func, lst).itervalues())
Example #31
def modulo_group(n: int, seq: Iterable) -> List[List[Any]]:
    grouped = group_by(lambda l: l[0] % n, enumerate(seq))
    return lmap(lambda l: lmap(lambda t: t[1], l),
                map(lambda g: grouped[g], range(0, n)))
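For illustration, modulo_group deals a sequence out round-robin into n buckets: group_by keys each (index, item) pair by index % n, and the outer lmap strips the indices back off.

modulo_group(3, 'abcdefg')
# -> [['a', 'd', 'g'], ['b', 'e'], ['c', 'f']]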
Example #32
def summarize_evaluation(eval_dir,
                         selection_metric="val_accuracy",
                         ignore_worst=0):
    if not eval_dir.exists():
        print(f"No evalutation '{eval_dir}' found.")
        return

    with open(eval_dir / "config.json") as f:
        config = json.load(f)

    with open(eval_dir / "hyperparams.json") as f:
        hps = json.load(f)

    results_dir = eval_dir / "results"
    assert results_dir.exists(), f"No results found for '{eval_dir}'."
    summary_dir = eval_dir / "summary"

    if not summary_dir.exists():
        os.makedirs(summary_dir)

    result_files = [(list(fy.map(int, f[:-5].split("-"))), results_dir / f)
                    for f in os.listdir(results_dir)]

    fold_files = fy.group_by(lambda f: f[0][0], result_files)
    fold_param_files = {
        fold: fy.group_by(lambda f: f[0][1], files)
        for fold, files in fold_files.items()
    }
    folds = list(fold_param_files.items())
    folds.sort(key=fy.first)

    best_goal = selection_metrics[selection_metric]

    results = []
    all_hps = True

    for fold_i, param_files in folds:
        best_res = None
        param_file_items = list(param_files.items())

        all_hps = all_hps and len(param_files) == len(hps)

        for hp_i, files in param_file_items:
            hp_train_results = defaultdict(list)
            hp_test_results = defaultdict(list)
            selection_vals = []
            all_selection_vals = []
            for (_, _, i), file in files:
                with open(file, "r") as f:
                    result = json.load(f)

                selection_val = result["train"][selection_metric][-1]
                all_selection_vals.append(selection_val)
                if i < config["repeat"]:
                    selection_vals.append(selection_val)

                for metric, val in result["train"].items():
                    hp_train_results[metric].append(val[-1])
                for metric, val in result["test"].items():
                    hp_test_results[metric].append(val)

            top_idxs = np.argsort(np.array(all_selection_vals))

            if len(all_selection_vals) > ignore_worst:
                if best_goal == "max":
                    top_idxs = top_idxs[ignore_worst:]
                elif best_goal == "min":
                    top_idxs = top_idxs[:-ignore_worst]

            top_statistics = fy.compose(statistics,
                                        lambda l: np.array(l)[top_idxs])

            hp_res = dict(fold_idx=fold_i,
                          train=dict_map(top_statistics, hp_train_results),
                          test=dict_map(top_statistics, hp_test_results),
                          select=np.mean(selection_vals),
                          hp_i=hp_i,
                          hp=hps[hp_i],
                          select_repeats=len(selection_vals),
                          eval_repeats=len(files))

            if (best_res is None or
                (best_goal == "max" and best_res["select"] < hp_res["select"])
                    or
                (best_goal == "min" and best_res["select"] > hp_res["select"])
                    or
                (best_res["select"] == hp_res["select"]
                 and best_res["eval_repeats"] < hp_res["eval_repeats"])):
                best_res = hp_res

        if best_res is not None:
            results.append(best_res)
        else:
            print(f"No results for {fold_i}.")

    combined_train = dict_map(
        statistics,
        fy.merge_with(
            np.array,
            *map(lambda res: dict_map(lambda t: t["mean"], res["train"]),
                 results)))
    combined_test = dict_map(
        statistics,
        fy.merge_with(
            np.array,
            *map(lambda res: dict_map(lambda t: t["mean"], res["test"]),
                 results)))

    results_summary = {
        "folds": results,
        "combined_train": combined_train,
        "combined_test": combined_test,
        "args": {
            "ignore_worst": ignore_worst
        },
        "done": all_hps and len(folds) == 10
    }

    with open(summary_dir / "results.json", "w") as f:
        json.dump(results_summary, f, cls=NumpyEncoder, indent="\t")

    return results_summary
Example #33
def group(events):
    return group_by(lambda x: x['repo'], events)
Example #34
def transform(ast):
    return group_by(categorize, ast.walk_down())
Example #35
def export_twitterUser_emotion_analysis(db='UserPost',collection="user_post"):
    client = MongoClient()
    db_tweets = client['%s' % db]
    collect_tweets = db_tweets['%s' % collection]
    db_user = client['Twitter']
    collect_user = db_user['twitter']
    from funcy import flatten,concat,group_by
    # Extract the set of users from the existing posts
    pipline = [
        {"$match": {
            "site": "twitter"
        }},
        {"$group": {
            "_id": "$user.id_str",
            "count": {"$sum": 1}
        }}
    ]
    result = list(collect_tweets.aggregate(pipline))
    formatDocs = []
    for id in list(map(lambda x: x['_id'], result)):
        # Look up this user's profile info
        user_for_id = collect_user.find_one({'id_str': id})
        # Find all posts by this user
        user_for_id_tweets_count = collect_tweets.count({"user.id_str": id, "site": 'twitter'})
        # print(user_for_id_tweets_count)
        if (user_for_id_tweets_count > 0):
            aggregate_for_user_tweets = collect_tweets.aggregate([
                {
                    "$match": {
                        "user.id_str": id,
                        "site": 'twitter'
                    }
                },
                {"$group": {
                    "_id": "$user.id_str",
                    "text":{"$push":"$text"}
                }}
            ])

            user_tweets_texts = list(aggregate_for_user_tweets)[0]
            # print(len(user_tweets_texts['text']))

            # print(texts)
            if len(user_tweets_texts['text'])>300:
                ops = [{'url':'https://tone-analyzer-demo.ng.bluemix.net/api/tone','data':''.join(user_tweets_texts['text'][i:i+300])} for i in range(0,len(user_tweets_texts['text']),300)]
            else:
                texts = ''.join(user_tweets_texts['text'])
                ops = [{'url':'https://tone-analyzer-demo.ng.bluemix.net/api/tone','data':texts}]
            # print(ops)
            analyzer = asynchronous_request_facebook_api(ops)
            # print(analyzer[0])
            final_result  = list(concat(list(flatten(list(map(lambda x:x['document_tone']['tones'],analyzer))))))
            group_result = group_by(lambda x:x['tone_name'],final_result)
            
            formatDocs.append({})
            print(len(formatDocs))
        else:
            print(id)
    df2 = pd.DataFrame(formatDocs)
    df2 = df2.applymap(lambda x: x.encode('unicode_escape').
                       decode('utf-8') if isinstance(x, str) else x)
    # print(docs)
    df2.to_excel('./export_data/%s/user_summary/%s.xlsx' % ("twitter", "twitter_user_summary"),
                 sheet_name='Sheet1')
Example #36
_MONTHS = [
    'январь', 'февраль', 'март', 'апрель', 'май', 'июнь', 'июль', 'август',
    'сентябрь', 'октябрь', 'ноябрь', 'декабрь'
]

# subjects
_SUBJECTS = [
    'английский', 'алгебра', 'wolfram mathematica', 'мат. анализ',
    'дискр. мат', 'диффуры'
]

# photos
pic_path = os.listdir(path='pictures')
all_pic = [
    list(map(lambda x: 'pictures/' + x, i))
    for i in list(group_by(0, pic_path).values())
]
pic_category = ['hello', 'bye', 'level']
_PICTURES = dict(zip(pic_category, all_pic))

# birthdays
with open('data/birth.txt', 'r', encoding='utf-8') as birthdays:
    birthdays_list = birthdays.readlines()

# films
with open('data/films.txt', 'r', encoding='utf-8') as films:
    film_list = films.readlines()

# olympiad problems
with open('data/olimp.txt', 'r', encoding='utf-8') as olimp:
    olimp_list = olimp.readlines()
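A side note on group_by(0, pic_path) above: funcy also accepts an int key and treats it as itemgetter, so the filenames end up grouped by their first character. A minimal sketch with made-up names:

from funcy import group_by

names = ['hello1.png', 'hello2.png', 'bye1.png', 'level1.png']
group_by(0, names)
# 0 acts as itemgetter(0), i.e. the first character of each name
# -> defaultdict with {'h': ['hello1.png', 'hello2.png'], 'b': ['bye1.png'], 'l': ['level1.png']}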
Example #37
def group_by_attr(attr: str, it: Iterable):
    return dict(group_by(operator.attrgetter(attr), it))