Example 1
    def handle(self, *args, **options):
        # Django 1.6/1.7 passes command arguments in `args`, 1.8+ in `options`;
        # combine both so either calling convention works
        fields = distinct(list(args) + options.get('field', []))
        groups = group_values(f.rsplit('.', 1) for f in fields)
        for model_name, field_names in groups.items():
            self.migrate_fields(model_name, field_names)
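
For context, funcy's group_values collects (key, value) pairs into a dict of lists; that is what lets the snippet above bucket field names per model. A minimal standalone sketch, assuming only that funcy is installed:

from funcy import group_values

pairs = [('Series', 'attrs'), ('Series', 'samples'), ('Sample', 'platform')]
print(group_values(pairs))
# -> {'Series': ['attrs', 'samples'], 'Sample': ['platform']}  (a defaultdict(list))
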
Example 2
    def handle(self, *args, **options):
        group = group_values(
            [convert_date(attr),
             attr.get('pubmed_id', '').split('|\n|')]
            for attr in Series.objects.values_list('attrs', flat=True))

        uniq_pmids = set()

        def count_uniq_pmids(pmids):
            uniq_pmids.update(set(flatten(pmids)))
            return len(uniq_pmids)

        pmids = dict(walk_values(count_uniq_pmids, sorted(group.items())))

        delta = CURRENT_DATE - START_DATE
        keys = sorted(
            set(
                ceil_date(START_DATE + timedelta(days=index * 20))
                for index in range(delta.days // 20 + 1)))

        for index, date in enumerate(keys):
            hc = HistoricalCounter.objects.filter(created_on=date).first()
            if not hc:
                continue
            hc.counters['PMID'] = get_value(keys, index)(pmids)
            hc.save()
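
The nested count_uniq_pmids closure relies on evaluation order: walk_values maps over the date buckets of sorted(group.items()) in order, so each returned length is a running total of unique PMIDs up to that bucket. The same pattern with toy data (dates and IDs made up):

from funcy import walk_values, flatten

group = {'2017-01': [['1', '2']], '2017-02': [['2', '3']]}
uniq = set()

def count_uniq(pmid_lists):
    uniq.update(flatten(pmid_lists))
    return len(uniq)

print(dict(walk_values(count_uniq, sorted(group.items()))))
# -> {'2017-01': 2, '2017-02': 3}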
Example 3
import funcy as fn
from collections import defaultdict

# get_edges() and possible_edges() are helpers defined elsewhere in the module
def adj_list(concept_class, parallel=True):
    if parallel:
        # pathos serializes with dill, so it can ship callables that the
        # stdlib multiprocessing pool cannot pickle
        from pathos.multiprocessing import ProcessingPool
        pool = ProcessingPool()
        mapper = pool.map
    else:
        mapper = map

    edge_generator = fn.cat(mapper(get_edges, possible_edges(concept_class)))
    edge_lists = fn.walk_values(set, fn.group_values(edge_generator))
    return defaultdict(set, edge_lists)
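
Stripped of the parallel machinery, the last two lines are a standard edge-list-to-adjacency-map conversion; the edges below are toy data:

import funcy as fn
from collections import defaultdict

edges = [('a', 'b'), ('a', 'c'), ('b', 'c'), ('a', 'b')]
adj = defaultdict(set, fn.walk_values(set, fn.group_values(edges)))
print(adj['a'])  # -> {'b', 'c'}  (the duplicate ('a', 'b') edge collapses)
print(adj['z'])  # -> set()  (unseen nodes get an empty neighbour set)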
Example 4
def mygene_fetch(platform, probes, scopes):
    """Queries mygene.info for current entrezid and sym, given an identifier."""
    if scopes == "dna":
        probes = get_dna_probes(platform, probes)
        scopes = "accession"

    def extract_queries(lines):
        lines = remove(r'^(IMAGE:\d+|--[\w>-]+)$', lines)
        queries = cat(re_iter(r'[\w+.-]+', l) for l in lines)
        queries = remove(r'_at$|^\d+-\d+$', queries)  # No such thing
        return queries
        # NOTE: the unicode cleanup below was left unreachable after the
        # early return above; kept commented out for reference.
        # http://stackoverflow.com/questions/15321138/removing-unicode-u2026-like-characters
        # return [
        #     q.decode('unicode_escape').encode('ascii', 'ignore')
        #     for q in queries
        # ]

    _by_probe = group_values(probes.items())
    queries_by_probe = walk_values(extract_queries, _by_probe)

    # Collect all possible queries to make a single request to mygene
    queries = set(cat(queries_by_probe.values()))

    if not queries:
        return []
    mygenes = _mygene_fetch(queries, scopes, platform.specie)

    # Form results into rows
    results = []
    dups = 0
    for probe, queries in queries_by_probe.items():
        matches = ldistinct(keep(mygenes.get, queries))
        # Skip dups
        if len(matches) > 1:
            dups += 1
        elif matches:
            entrez, sym = matches[0]
            results.append({
                'probe': probe,
                'mygene_sym': sym,
                'mygene_entrez': entrez
            })
    if dups:
        cprint('-> Produced %d dups' % dups, 'red')
    return results
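
The dedup step is the crux of the row-building loop: keep(mygenes.get, queries) drops queries with no hit, and ldistinct collapses repeated hits, so len(matches) > 1 means a probe genuinely matched two different genes. The same step in isolation, with made-up data:

from funcy import keep, ldistinct

mygenes = {'BRCA1': (672, 'BRCA1'), 'brca1-alias': (672, 'BRCA1')}
queries = ['BRCA1', 'no-hit', 'brca1-alias']
print(ldistinct(keep(mygenes.get, queries)))
# -> [(672, 'BRCA1')] : one unambiguous match, so this probe yields a row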
Example 5
import funcy
import pandas as pd

# sum_word_vectors() and m_estimate() are helpers defined elsewhere
def calculate_m_estimates(X, y, alpha=1.0):
    total_counts = sum_word_vectors(X)
    vocabulary = list(total_counts.keys())
    m = len(vocabulary) * alpha
    p = 1.0 / len(vocabulary)  # uniform prior probability per word
    n = sum(total_counts.values())

    grouped_counts = {
        k: sum_word_vectors(v)
        for k, v in funcy.group_values(zip(y, X)).items()
    }
    counts_as_series = {
        k: pd.Series(v).reindex(vocabulary, fill_value=0)
        for k, v in grouped_counts.items()
    }
    likelihood_m_estimates = {
        k: m_estimate(v, p, m, n)
        for k, v in counts_as_series.items()
    }

    return likelihood_m_estimates
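
m_estimate itself is not shown in this example. A plausible definition, assuming the standard m-estimate smoothing P = (count + m*p) / (n + m); this is a guess at the helper, not the original source:

def m_estimate(counts, p, m, n):
    # counts is a pd.Series of per-class word counts, so the arithmetic
    # broadcasts and yields one smoothed probability per vocabulary word
    return (counts + m * p) / (n + m)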
Example 6
import funcy as fn
from pyrsistent import pmap
# unpack_name, to_size and BundleMap come from the surrounding module
def rebundle_names(names):
    grouped_names = fn.group_values(map(unpack_name, names))
    return BundleMap(pmap(fn.walk_values(to_size, grouped_names)))
Example 7
# distribute_by_created_on (used below) is defined elsewhere in the module
def distribute_series_and_sample_annotations(qs):
    series_annotations = distribute_by_created_on(qs)
    values = qs.values_list('created_on', 'samples')
    group = group_values(walk_keys(ceil_date, values.iterator()))
    return series_annotations, accumulate(walk_values(sum, group))
def distribute_by_user_id(qs):
    data = group_values(qs.values_list('created_by_id', 'created_on'))
    return walk_values(lambda dates: accumulate(count_by(ceil_date, dates)),
                       data)
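
count_by tallies items by a key function; the project's accumulate helper (not shown) then turns those per-bucket counts into running totals. The count_by half, with a hypothetical stand-in for ceil_date:

from datetime import date
from funcy import count_by

def ceil_date(d):  # hypothetical stand-in: bucket a date by its month
    return date(d.year, d.month, 1)

dates = [date(2017, 1, 5), date(2017, 1, 20), date(2017, 2, 2)]
print(dict(count_by(ceil_date, dates)))
# -> {date(2017, 1, 1): 2, date(2017, 2, 1): 1}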
    def handle(self, *args, **options):
        series = {}
        samples = {}

        platform_created_on = join_with(min, [
            {p: ceil_attrs_date(s) for p in s.platforms}
            for s in Series.objects.all()
        ])
        platform_qs = Platform.objects.annotate(probes_count=Count('probes'))\
            .values('gpl_name', 'probes_count')
        platforms = {}
        platforms_probes = {}

        series_annotations = {}
        sample_annotations = {}
        concordant_series_annotations = {}
        concordant_sample_annotations = {}

        series_tags = {}
        concordant_series_tags = {}
        sample_tags = {}
        concordant_sample_tags = {}

        series_validations = {}
        sample_validations = {}
        concordant_series_validations = {}
        concordant_sample_validations = {}

        for specie in SPECIES.values():
            series[specie] = accumulate(
                count_by(ceil_attrs_date,
                         Series.objects.filter(specie=specie)))

            qs = Sample.objects.filter(platform__specie=specie)
            iterator = tqdm(queryset_iterator(qs, 30000),
                            total=qs.count(),
                            desc='{0} samples'.format(specie))
            samples[specie] = accumulate(count_by(ceil_attrs_date, iterator))

            platforms_data = [[
                platform_created_on[item['gpl_name']], item['probes_count']
            ] for item in platform_qs.filter(specie=specie)]
            platforms[specie] = accumulate(count_by(first, platforms_data))
            group = group_values(platforms_data)
            platforms_probes[specie] = accumulate(walk_values(sum, group))

            qs = SeriesAnnotation.objects.filter(series__specie=specie)
            series_annotations[specie], \
                sample_annotations[specie] = distribute_series_and_sample_annotations(qs)

            concordant_series_annotations[specie], \
                concordant_sample_annotations[specie] = distribute_series_and_sample_annotations(
                    qs.filter(best_cohens_kappa=1))

            qs = SeriesTag.objects.filter(platform__specie=specie,
                                          is_active=True)
            series_tags[specie] = distribute_by_created_on(qs)
            concordant_series_tags[specie] = distribute_by_created_on(
                qs.exclude(agreed=None))

            qs = SampleTag.objects.filter(sample__platform__specie=specie,
                                          is_active=True)
            sample_tags[specie] = distribute_by_created_on(qs)
            concordant_sample_tags[specie] = distribute_by_created_on(
                qs.exclude(series_tag__agreed=None))

            qs = SerieValidation.objects.filter(platform__specie=specie,
                                                ignored=False,
                                                by_incompetent=False)
            series_validations[specie] = distribute_by_created_on(qs)
            concordant_series_validations[specie] = distribute_by_created_on(
                qs.filter(best_kappa=1))

            qs = SampleValidation\
                .objects\
                .filter(sample__platform__specie=specie,
                        serie_validation__ignored=False,
                        serie_validation__by_incompetent=False)
            sample_validations[specie] = distribute_by_created_on(qs)
            concordant_sample_validations[specie] = distribute_by_created_on(
                qs.filter(
                    Q(serie_validation__best_kappa=1) | Q(concordant=True)))

        users = accumulate(
            count_by(ceil_date,
                     User.objects.values_list('date_joined', flat=True)))
        tags = accumulate(
            count_by(ceil_date, Tag.objects.values_list('created_on',
                                                        flat=True)))

        delta = CURRENT_DATE - START_DATE
        keys = sorted(
            set(
                ceil_date(START_DATE + timedelta(days=index * 20))
                for index in range(delta.days // 20 + 1)))

        specie_data = {
            'series': series,
            'samples': samples,
            'platforms': platforms,
            'platforms_probes': platforms_probes,
            'series_annotations': series_annotations,
            'sample_annotations': sample_annotations,
            'concordant_series_annotations': concordant_series_annotations,
            'concordant_sample_annotations': concordant_sample_annotations,
            'series_tags': series_tags,
            'sample_tags': sample_tags,
            'concordant_series_tags': concordant_series_tags,
            'concordant_sample_tags': concordant_sample_tags,
            'series_validations': series_validations,
            'sample_validations': sample_validations,
            'concordant_series_validations': concordant_series_validations,
            'concordant_sample_validations': concordant_sample_validations,
            'series_tags_by_users': distribute_by_user_id(
                SeriesTag.objects.filter(is_active=True)),
            'sample_tags_by_users': distribute_by_user_id(
                SampleTag.objects.filter(is_active=True)),
            'series_validations_by_users': distribute_by_user_id(
                SerieValidation.objects.filter(ignored=False,
                                               by_incompetent=False)),
            'sample_validations_by_users': distribute_by_user_id(
                SampleValidation.objects.filter(
                    serie_validation__ignored=False,
                    serie_validation__by_incompetent=False)),
            'series_tag_history': get_series_tag_history(),
        }

        data = {
            'users': users,
            'tags': tags,
        }

        with transaction.atomic():
            HistoricalCounter.objects.filter(
                created_on__lte=CURRENT_DATE).delete()
            HistoricalCounter.objects.bulk_create([
                HistoricalCounter(created_on=key,
                                  counters=merge(
                                      walk_values(get_value(keys, index),
                                                  data),
                                      walk_values(
                                          lambda value: walk_values(
                                              get_value(keys, index), value),
                                          specie_data)))
                for index, key in enumerate(keys)
            ])