async def _req_frags_ngramms_cocitauthors(request: web.Request) -> web.StreamResponse:
  """Cross-distribution of "phrases" vs. "co-citations"."""
  app = request.app
  mdb = app['db']

  topn = getreqarg_topn(request)
  if not topn:
    topn = 10

  contexts = mdb.contexts

  # Optional restriction to the top co-cited authors.
  topn_authors: int = getreqarg_int(request, 'topn_cocitauthors')
  if topn_authors:
    topNa = await _get_topn_cocit_authors(
      contexts, topn_authors, include_conts=False)
    exists = frozenset(t for t, _ in topNa)
  else:
    exists = ()

  nka: int = getreqarg_int(request, 'nka')
  ltype: str = getreqarg(request, 'ltype')

  n_gramms = mdb.n_gramms
  top_ngramms = await _get_topn_ngramm(
    n_gramms, nka, ltype, topn, title_always_id=True, show_type=True)

  out_dict = []
  for i, (ngramm, typ_, cnt, conts) in enumerate(top_ngramms, 1):
    frags = Counter()              # frag_num -> number of mentions
    congr = defaultdict(Counter)   # co-cited author -> frag_num -> count
    cnt = 0

    async for doc in contexts.aggregate([
      {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': conts}}},
      {'$project': {'prefix': False, 'suffix': False, 'exact': False}},
      {'$unwind': '$cocit_authors'},
    ]):
      ngr = doc['cocit_authors']
      if topn_authors and ngr not in exists:
        continue

      cnt += 1
      fnum = doc['frag_num']
      frags[fnum] += 1
      congr[ngr][fnum] += 1

    if cnt:
      # crosscocitaith is appended now and filled in below: the same dict
      # object is shared by reference with the response item.
      crosscocitaith = {}
      out_dict.append(dict(
        title=ngramm.split('_', 1)[-1], type=typ_, sum=cnt, frags=frags,
        cocitaithors=crosscocitaith))

      for j, (co, cnts) in enumerate(
        sorted(congr.items(), key=lambda kv: (-sum(kv[1].values()), kv[0])), 1
      ):
        crosscocitaith[co] = dict(frags=cnts, sum=sum(cnts.values()))

  return json_response(out_dict)
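# Illustrative only, not used by the handlers: a sample item in the shape that
# _req_frags_ngramms_cocitauthors appends to its response list. All values are
# invented; the key spelling 'cocitaithors' follows the code above.
_EXAMPLE_FRAGS_NGRAMMS_COCITAUTHORS_ITEM = dict(
  title='some phrase',                # n-gram title with the id prefix stripped
  type='lemmas',                      # hypothetical n-gram type
  sum=12,                             # number of matched co-citation mentions
  frags={1: 3, 2: 9},                 # frag_num -> mentions of the phrase
  cocitaithors={                      # co-cited author -> per-fragment counts
    'Author A': dict(frags={1: 2, 2: 5}, sum=7),
    'Author B': dict(frags={2: 4}, sum=4),
  },
)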
async def _reg_cnt_ngramm(request: web.Request) -> web.StreamResponse:
  """Per-n-gram counts: total, per context, and distinct publications."""
  app = request.app
  mdb = app['db']

  topn = getreqarg_topn(request)
  nka = getreqarg_int(request, 'nka')
  ltype = getreqarg(request, 'ltype')

  if nka or ltype:
    pipeline = [get_ngramm_filter(nka, ltype)]
  else:
    pipeline = []
  pipeline += [{'$sort': {'count_all': -1, 'title': 1, 'type': 1}}]
  if topn:
    pipeline += [{'$limit': topn}]

  out = []
  get_as_tuple = itemgetter('title', 'type', 'linked_papers')
  n_gramms = mdb.n_gramms
  async for doc in n_gramms.aggregate(pipeline):
    title, lt, conts = get_as_tuple(doc)
    # When a specific ltype is requested, the type field is redundant.
    res = dict(title=title) if ltype else dict(title=title, type=lt)
    cnt_all = cnt_cont = 0
    pubs = set()
    # Each linked_papers entry is a {'cont_id': ..., 'cnt': ...} mapping;
    # the unpacking below relies on that key order.
    for cid, cnt in (c.values() for c in conts):
      cnt_cont += 1
      cnt_all += cnt
      pubs.add(cid.rsplit('@', 1)[0])
    res.update(
      count_all=doc['count_all'], count=cnt_all, count_conts=cnt_cont,
      conts_pubs=len(pubs))
    out.append(res)

  return json_response(out)
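# A minimal sketch of the n_gramms document shape these handlers assume,
# reconstructed from the fields referenced above (other fields are omitted;
# values are invented). The 'for cid, cnt in (c.values() for c in conts)'
# unpacking relies on every linked_papers entry being exactly a
# {'cont_id': ..., 'cnt': ...} mapping in that key order.
_EXAMPLE_NGRAMM_DOC = {
  'title': 'some phrase',
  'type': 'lemmas',                      # hypothetical value
  'count_all': 42,
  'linked_papers': [
    {'cont_id': 'pub123@1', 'cnt': 3},   # '<pub_id>@<context/fragment number>'
    {'cont_id': 'pub456@2', 'cnt': 1},
  ],
}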
async def _req_top_ngramm(request: web.Request) -> web.StreamResponse:
  """Top N phrases."""
  app = request.app
  mdb = app['db']

  topn = getreqarg_topn(request)
  nka = getreqarg_int(request, 'nka')
  ltype = getreqarg(request, 'ltype')

  n_gramms = mdb.n_gramms
  topN = await _get_topn_ngramm(n_gramms, nka, ltype, topn)
  out = tuple(dict(title=n, contects=conts) for n, _, conts in topN)

  return json_response(out)
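# For reference only: a sample element of the _req_top_ngramm response.
# 'contects' is spelled as in the handler; whether the values are context ids
# like 'pub123@1' depends on _get_topn_ngramm, which is defined elsewhere, so
# treat the value format here as an assumption.
_EXAMPLE_TOP_NGRAMM_ITEM = dict(
  title='some phrase', contects=['pub123@1', 'pub123@2'])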
async def _reg_cnt_pubs_ngramm(request: web.Request) -> web.StreamResponse:
  """Per-publication n-gram counts."""
  app = request.app
  mdb = app['db']

  topn = getreqarg_topn(request)
  nka = getreqarg_int(request, 'nka')
  ltype = getreqarg(request, 'ltype')

  if nka or ltype:
    pipeline = [get_ngramm_filter(nka, ltype)]
  else:
    pipeline = []

  n_gramms = mdb.n_gramms
  publications: Collection = mdb.publications

  out = {}
  get_as_tuple = itemgetter('title', 'type', 'linked_papers')

  async for pobj in publications.find({'name': {'$exists': True}}):
    pub_id = pobj['_id']
    # Keep only n-grams linked to contexts of this publication
    # (context ids start with '<pub_id>@').
    pipeline_work = [
      {'$match': {'linked_papers.cont_id': {'$regex': f'^{pub_id}@'}}}
    ] + pipeline

    out_ngrs = []
    cont_starts = pub_id + '@'
    async for obj in n_gramms.aggregate(pipeline_work):
      title, lt, conts = get_as_tuple(obj)
      res = dict(title=title) if ltype else dict(title=title, type=lt)
      res.update(count_all=obj['count_all'])
      cnt_all = cnt_cont = 0
      for cid, cnt in (c.values() for c in conts):
        if cid.startswith(cont_starts):
          cnt_cont += 1
          cnt_all += cnt
      res.update(count=cnt_all, count_conts=cnt_cont)
      out_ngrs.append(res)

    out_ngrs = sorted(out_ngrs, key=itemgetter('count'), reverse=True)
    if topn:
      out_ngrs = out_ngrs[:topn]

    out[pub_id] = out_ngrs

  return json_response(out)
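# Illustrative only: the shape of the _reg_cnt_pubs_ngramm response, keyed by
# publication id; each list is sorted by 'count' and truncated to topn. Values
# are invented; when 'ltype' is passed, the 'type' key is omitted.
_EXAMPLE_CNT_PUBS_NGRAMM_RESPONSE = {
  'pub123': [
    {'title': 'some phrase', 'type': 'lemmas', 'count_all': 42,
     'count': 4, 'count_conts': 2},
    {'title': 'another phrase', 'type': 'lemmas', 'count_all': 17,
     'count': 1, 'count_conts': 1},
  ],
}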
async def _req_top_ngramm_pubs(request: web.Request) -> web.StreamResponse:
  """Top N phrases by publication."""
  app = request.app
  mdb = app['db']

  topn = getreqarg_topn(request)
  nka = getreqarg_int(request, 'nka')
  ltype = getreqarg(request, 'ltype')

  if nka or ltype:
    pipeline = [get_ngramm_filter(nka, ltype)]
  else:
    pipeline = []

  pipeline += [
    {'$unwind': '$linked_papers'},
    {'$group': {
      '_id': '$title',
      'count': {'$sum': '$linked_papers.cnt'},
      'conts': {
        '$addToSet': {
          'cont_id': '$linked_papers.cont_id',
          'cnt': '$linked_papers.cnt'}}}},
    {'$sort': {'count': -1, '_id': 1}},
  ]
  if topn:
    pipeline += [{'$limit': topn}]

  n_gramms = mdb.n_gramms
  get_as_tuple = itemgetter('_id', 'count', 'conts')
  topN = [get_as_tuple(obj) async for obj in n_gramms.aggregate(pipeline)]

  get_pubs = itemgetter('cont_id', 'cnt')
  out = {
    name: dict(
      all=cnt,
      # Collapse context ids to publication ids ('<pub_id>@<n>' -> '<pub_id>')
      # and repeat each publication 'cnt' times so Counter sums the counts.
      contects=Counter(
        p for p, n in (
          (c.rsplit('@', 1)[0], n)
          for c, n in (get_pubs(co) for co in conts))
        for _ in range(n)
      ))
    for name, cnt, conts in topN}

  return json_response(out)
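# A minimal wiring sketch, assuming these handlers are registered on an
# aiohttp application; the URL paths below are placeholders, the real routes
# are defined elsewhere in the project.
def _setup_example_routes(app: web.Application) -> None:
  app.router.add_get('/frags/ngramms/cocitauthors', _req_frags_ngramms_cocitauthors)
  app.router.add_get('/cnt/ngramms', _reg_cnt_ngramm)
  app.router.add_get('/top/ngramms', _req_top_ngramm)
  app.router.add_get('/cnt/publications/ngramms', _reg_cnt_pubs_ngramm)
  app.router.add_get('/top/ngramms/publications', _req_top_ngramm_pubs)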