def handle(self, *args, **options):
    # Django 1.6/1.7 passes command arguments as `args`, 1.8+ as `options`;
    # merge both so either calling convention works.
    fields = distinct(list(args) + options.get('field', []))
    groups = group_values(f.rsplit('.', 1) for f in fields)
    for model_name, field_names in groups.items():
        self.migrate_fields(model_name, field_names)
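# For reference: funcy's group_values() turns a sequence of (key, value) pairs
# into a defaultdict(list) keyed by the first element. A minimal standalone
# sketch of the grouping above (the 'app.Model.field' strings are invented):
from funcy import group_values

_fields = ['books.Book.title', 'books.Book.isbn', 'auth.User.email']
_groups = group_values(f.rsplit('.', 1) for f in _fields)
assert dict(_groups) == {'books.Book': ['title', 'isbn'], 'auth.User': ['email']}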
def handle(self, *args, **options):
    group = group_values(
        [convert_date(attr), attr.get('pubmed_id', '').split('|\n|')]
        for attr in Series.objects.values_list('attrs', flat=True))

    uniq_pmids = set()

    def count_uniq_pmids(pmids):
        # Running total: uniq_pmids accumulates across dates, which is why
        # the items are walked in sorted (chronological) order below.
        uniq_pmids.update(flatten(pmids))
        return len(uniq_pmids)

    pmids = dict(walk_values(count_uniq_pmids, sorted(group.items())))

    delta = CURRENT_DATE - START_DATE
    keys = sorted(set(
        ceil_date(START_DATE + timedelta(days=index * 20))
        for index in range(delta.days // 20 + 1)))

    for index, date in enumerate(keys):
        hc = HistoricalCounter.objects.filter(created_on=date).first()
        if not hc:
            continue
        hc.counters['PMID'] = get_value(keys, index)(pmids)
        hc.save()
def adj_list(concept_class, parallel=True):
    if parallel:
        from pathos.multiprocessing import ProcessingPool
        pool = ProcessingPool()
        mapper = pool.map
    else:
        mapper = map

    edge_generator = fn.cat(mapper(get_edges, possible_edges(concept_class)))
    edge_lists = fn.walk_values(set, fn.group_values(edge_generator))
    return defaultdict(set, edge_lists)
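# pathos' ProcessingPool is a dill-backed variant of multiprocessing.Pool, so
# `mapper` can ship closures the stdlib pickler rejects. The grouping itself is
# plain funcy; a toy run with literal edge pairs standing in for the output of
# get_edges / possible_edges:
import funcy as fn
from collections import defaultdict

_edges = [('a', 'b'), ('a', 'b'), ('a', 'c'), ('b', 'c')]
_adj = defaultdict(set, fn.walk_values(set, fn.group_values(_edges)))
assert _adj == {'a': {'b', 'c'}, 'b': {'c'}}
assert _adj['unknown'] == set()  # defaultdict yields empty neighbor sets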
def mygene_fetch(platform, probes, scopes):
    """Queries mygene.info for current entrezid and sym, given an identifier."""
    if scopes == "dna":
        probes = get_dna_probes(platform, probes)
        scopes = "accession"

    def extract_queries(lines):
        lines = remove(r'^(IMAGE:\d+|--[\w>-]+)$', lines)
        queries = cat(re_iter(r'[\w+.-]+', l) for l in lines)
        queries = remove(r'_at$|^\d+-\d+$', queries)  # No such thing
        # Clean unicode for mygene
        # http://stackoverflow.com/questions/15321138/removing-unicode-u2026-like-characters
        return [q.decode('unicode_escape').encode('ascii', 'ignore')
                for q in queries]

    _by_probe = group_values(probes.items())
    queries_by_probe = walk_values(extract_queries, _by_probe)

    # Collect all possible queries to make a single request to mygene
    queries = set(cat(queries_by_probe.values()))
    if not queries:
        return []
    mygenes = _mygene_fetch(queries, scopes, platform.specie)

    # Form results into rows
    results = []
    dups = 0
    for probe, probe_queries in queries_by_probe.items():
        matches = ldistinct(keep(mygenes.get, probe_queries))
        # Skip dups
        if len(matches) > 1:
            dups += 1
        elif matches:
            entrez, sym = matches[0]
            results.append({
                'probe': probe,
                'mygene_sym': sym,
                'mygene_entrez': entrez,
            })
    if dups:
        cprint('-> Produced %d dups' % dups, 'red')
    return results
def calculate_m_estimates(X, y, alpha=1.0):
    total_counts = sum_word_vectors(X)
    vocabulary = list(total_counts.keys())
    m = len(vocabulary) * alpha
    p = 1 / len(vocabulary)
    n = sum(total_counts.values())

    grouped_counts = {
        k: sum_word_vectors(v)
        for k, v in funcy.group_values(zip(y, X)).items()
    }
    counts_as_series = {
        k: pd.Series(v).reindex(vocabulary, fill_value=0)
        for k, v in grouped_counts.items()
    }
    likelihood_m_estimates = {
        k: m_estimate(v, p, m, n)
        for k, v in counts_as_series.items()
    }
    return likelihood_m_estimates
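# m_estimate() is not shown here. Assuming it implements the classic m-estimate
# smoothing from Mitchell's Naive Bayes formulation, (n_c + m*p) / (n + m), a
# hypothetical version would look like this -- a sketch, not the real helper:
import pandas as pd

def _m_estimate(counts, p, m, n):
    # Hypothetical: smooth raw per-class counts toward the prior p with
    # equivalent sample size m; n is the total count in the denominator.
    return (counts + m * p) / (n + m)

print(_m_estimate(pd.Series({'spam': 3, 'ham': 0}), p=0.5, m=2.0, n=10))
# spam: (3 + 1) / 12 ~= 0.333, ham: (0 + 1) / 12 ~= 0.083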
def rebundle_names(names):
    grouped_names = fn.group_values(map(unpack_name, names))
    return BundleMap(pmap(fn.walk_values(to_size, grouped_names)))
def distribute_series_and_sample_annotations(qs):
    series_annotations = distribute_by_created_on(qs)
    values = qs.values_list('created_on', 'samples')
    group = group_values(walk_keys(ceil_date, values.iterator()))
    return series_annotations, accumulate(walk_values(sum, group))
def distribute_by_user_id(qs):
    data = group_values(qs.values_list('created_by_id', 'created_on'))
    return walk_values(lambda dates: accumulate(count_by(ceil_date, dates)), data)
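# count_by and walk_values are funcy; accumulate and ceil_date are project
# helpers (by usage, a running cumulative sum and a date-bucketing function).
# A toy run of the funcy part, with invented user ids and pre-bucketed dates:
from funcy import count_by, group_values, walk_values

_rows = [(1, '2020-01'), (1, '2020-01'), (1, '2020-02'), (2, '2020-02')]
_per_user = walk_values(lambda ds: count_by(lambda d: d, ds), group_values(_rows))
assert dict(_per_user) == {1: {'2020-01': 2, '2020-02': 1}, 2: {'2020-02': 1}}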
def handle(self, *args, **options):
    series = {}
    samples = {}
    platform_created_on = join_with(
        min, [{p: ceil_attrs_date(s) for p in s.platforms}
              for s in Series.objects.all()])
    platform_qs = Platform.objects.annotate(probes_count=Count('probes'))\
        .values('gpl_name', 'probes_count')
    platforms = {}
    platforms_probes = {}
    series_annotations = {}
    sample_annotations = {}
    concordant_series_annotations = {}
    concordant_sample_annotations = {}
    series_tags = {}
    concordant_series_tags = {}
    sample_tags = {}
    concordant_sample_tags = {}
    series_validations = {}
    sample_validations = {}
    concordant_series_validations = {}
    concordant_sample_validations = {}

    for specie in SPECIES.values():
        series[specie] = accumulate(
            count_by(ceil_attrs_date, Series.objects.filter(specie=specie)))

        qs = Sample.objects.filter(platform__specie=specie)
        iterator = tqdm(queryset_iterator(qs, 30000),
                        total=qs.count(),
                        desc='{0} samples'.format(specie))
        samples[specie] = accumulate(count_by(ceil_attrs_date, iterator))

        platforms_data = [
            [platform_created_on[item['gpl_name']], item['probes_count']]
            for item in platform_qs.filter(specie=specie)]
        platforms[specie] = accumulate(count_by(first, platforms_data))
        group = group_values(platforms_data)
        platforms_probes[specie] = accumulate(walk_values(sum, group))

        qs = SeriesAnnotation.objects.filter(series__specie=specie)
        series_annotations[specie], \
            sample_annotations[specie] = distribute_series_and_sample_annotations(qs)
        concordant_series_annotations[specie], \
            concordant_sample_annotations[specie] = distribute_series_and_sample_annotations(
                qs.filter(best_cohens_kappa=1))

        qs = SeriesTag.objects.filter(platform__specie=specie, is_active=True)
        series_tags[specie] = distribute_by_created_on(qs)
        concordant_series_tags[specie] = distribute_by_created_on(
            qs.exclude(agreed=None))

        qs = SampleTag.objects.filter(sample__platform__specie=specie, is_active=True)
        sample_tags[specie] = distribute_by_created_on(qs)
        concordant_sample_tags[specie] = distribute_by_created_on(
            qs.exclude(series_tag__agreed=None))

        qs = SerieValidation.objects.filter(
            platform__specie=specie, ignored=False, by_incompetent=False)
        series_validations[specie] = distribute_by_created_on(qs)
        concordant_series_validations[specie] = distribute_by_created_on(
            qs.filter(best_kappa=1))

        qs = SampleValidation\
            .objects\
            .filter(sample__platform__specie=specie,
                    serie_validation__ignored=False,
                    serie_validation__by_incompetent=False)
        sample_validations[specie] = distribute_by_created_on(qs)
        concordant_sample_validations[specie] = distribute_by_created_on(
            qs.filter(Q(serie_validation__best_kappa=1) | Q(concordant=True)))

    users = accumulate(
        count_by(ceil_date, User.objects.values_list('date_joined', flat=True)))
    tags = accumulate(
        count_by(ceil_date, Tag.objects.values_list('created_on', flat=True)))

    delta = CURRENT_DATE - START_DATE
    keys = sorted(set(
        ceil_date(START_DATE + timedelta(days=index * 20))
        for index in range(delta.days // 20 + 1)))

    specie_data = {
        'series': series,
        'samples': samples,
        'platforms': platforms,
        'platforms_probes': platforms_probes,
        'series_annotations': series_annotations,
        'sample_annotations': sample_annotations,
        'concordant_series_annotations': concordant_series_annotations,
        'concordant_sample_annotations': concordant_sample_annotations,
        'series_tags': series_tags,
        'sample_tags': sample_tags,
        'concordant_series_tags': concordant_series_tags,
        'concordant_sample_tags': concordant_sample_tags,
        'series_validations': series_validations,
        'sample_validations': sample_validations,
        'concordant_series_validations': concordant_series_validations,
        'concordant_sample_validations': concordant_sample_validations,
        'series_tags_by_users': distribute_by_user_id(
            SeriesTag.objects.filter(is_active=True)),
        'sample_tags_by_users': distribute_by_user_id(
            SampleTag.objects.filter(is_active=True)),
        'series_validations_by_users': distribute_by_user_id(
            SerieValidation.objects.filter(ignored=False, by_incompetent=False)),
        'sample_validations_by_users': distribute_by_user_id(
            SampleValidation.objects.filter(
                serie_validation__ignored=False,
                serie_validation__by_incompetent=False)),
        'series_tag_history': get_series_tag_history(),
    }

    data = {
        'users': users,
        'tags': tags,
    }

    with transaction.atomic():
        HistoricalCounter.objects.filter(created_on__lte=CURRENT_DATE).delete()
        HistoricalCounter.objects.bulk_create([
            HistoricalCounter(
                created_on=key,
                counters=merge(
                    walk_values(get_value(keys, index), data),
                    walk_values(
                        lambda value: walk_values(get_value(keys, index), value),
                        specie_data)))
            for index, key in enumerate(keys)
        ])