async def _reg_cnt_ngramm(request: web.Request) -> web.StreamResponse:
  """Return per-ngram counts aggregated over all linked contexts.

  Query args: topn (limit), nka (ngram arity filter), ltype (ngram type
  filter).  Each output item carries count_all, count, count_conts and the
  number of distinct publications (conts_pubs) the ngram occurs in.
  """
  mdb = request.app['db']
  topn = getreqarg_topn(request)
  nka = getreqarg_int(request, 'nka')
  ltype = getreqarg(request, 'ltype')

  pipeline = [get_ngramm_filter(nka, ltype)] if nka or ltype else []
  pipeline.append({'$sort': {'count_all': -1, 'title': 1, 'type': 1}})
  if topn:
    pipeline.append({'$limit': topn})

  unpack = itemgetter('title', 'type', 'linked_papers')
  results = []
  async for doc in mdb.n_gramms.aggregate(pipeline):
    title, lt, linked = unpack(doc)
    # When the type is fixed by the request, omit it from each item.
    if ltype:
      item = dict(title=title)
    else:
      item = dict(title=title, type=lt)
    total = conts_cnt = 0
    pub_ids = set()
    for linked_cont in linked:
      # Each linked context holds exactly (cont_id, cnt) — relies on the
      # stored document's key order; TODO confirm against the schema.
      cont_id, cnt = linked_cont.values()
      conts_cnt += 1
      total += cnt
      # cont_id is "<pub_id>@<suffix>"; keep the publication part.
      pub_ids.add(cont_id.rsplit('@', 1)[0])
    item.update(
      count_all=doc['count_all'], count=total,
      count_conts=conts_cnt, conts_pubs=len(pub_ids))
    results.append(item)
  return json_response(results)
async def _reg_cnt_pubs_ngramm(request: web.Request) -> web.StreamResponse:
  """Return, for every named publication, its ngrams ranked by count.

  For each publication the ngram pipeline is re-run restricted (via a
  ``^<pub_id>@`` regex on cont_id) to that publication's contexts, and
  per-ngram counts are accumulated only over matching contexts.
  """
  mdb = request.app['db']
  topn = getreqarg_topn(request)
  nka = getreqarg_int(request, 'nka')
  ltype = getreqarg(request, 'ltype')

  base_pipeline = [get_ngramm_filter(nka, ltype)] if nka or ltype else []

  n_gramms = mdb.n_gramms
  publications: Collection = mdb.publications
  unpack = itemgetter('title', 'type', 'linked_papers')

  out = {}
  # NOTE(review): one aggregate per publication (N+1 queries) — kept as-is.
  async for pub in publications.find({'name': {'$exists': True}}):
    pub_id = pub['_id']
    prefix = pub_id + '@'
    work_pipeline = [
      {'$match': {'linked_papers.cont_id': {'$regex': f'^{pub_id}@'}}}
    ] + base_pipeline

    ngramms = []
    async for doc in n_gramms.aggregate(work_pipeline):
      title, lt, linked = unpack(doc)
      item = dict(title=title) if ltype else dict(title=title, type=lt)
      item.update(count_all=doc['count_all'])
      total = conts_cnt = 0
      for linked_cont in linked:
        # (cont_id, cnt) by stored key order — TODO confirm schema.
        cont_id, cnt = linked_cont.values()
        if cont_id.startswith(prefix):
          conts_cnt += 1
          total += cnt
      item.update(count=total, count_conts=conts_cnt)
      ngramms.append(item)

    ngramms.sort(key=itemgetter('count'), reverse=True)
    if topn:
      ngramms = ngramms[:topn]
    out[pub_id] = ngramms
  return json_response(out)
async def _req_top_ngramm_pubs(request: web.Request) -> web.StreamResponse:
  """Top-N phrases by publications.

  Groups linked contexts by ngram title, sums their counts, and reports per
  phrase the total plus a Counter of occurrences per publication.
  """
  mdb = request.app['db']
  topn = getreqarg_topn(request)
  nka = getreqarg_int(request, 'nka')
  ltype = getreqarg(request, 'ltype')

  pipeline = [get_ngramm_filter(nka, ltype)] if nka or ltype else []
  pipeline.extend([
    {'$unwind': '$linked_papers'},
    {'$group': {
      '_id': '$title',
      'count': {'$sum': '$linked_papers.cnt'},
      'conts': {'$addToSet': {
        'cont_id': '$linked_papers.cont_id',
        'cnt': '$linked_papers.cnt'}}}},
    {'$sort': {'count': -1, '_id': 1}},
  ])
  if topn:
    pipeline.append({'$limit': topn})

  get_pubs = itemgetter('cont_id', 'cnt')
  out = {}
  async for doc in mdb.n_gramms.aggregate(pipeline):
    per_pub = Counter()
    for cont in doc['conts']:
      cont_id, cnt = get_pubs(cont)
      # Add cnt occurrences for the publication part of the cont_id.
      # Skip non-positive counts so the key is not created at all,
      # matching the original "repeat element cnt times" construction.
      if cnt > 0:
        per_pub[cont_id.rsplit('@', 1)[0]] += cnt
    # NOTE(review): key 'contects' (likely a typo for 'contexts') is part
    # of the JSON response contract and is preserved.
    out[doc['_id']] = dict(all=doc['count'], contects=per_pub)
  return json_response(out)
async def _req_publ_ngramm_ngramm(request: web.Request) -> web.StreamResponse:
  """Cross-distribution "publications" vs "phrases from citation contexts".

  For each of the top-N ngrams, collects the publications its contexts come
  from and the co-occurring ngrams (crossgrams) ranked by publication count.

  Fix: the trailing ``return`` was split from its ``json_response(out_list)``
  expression, so the handler returned ``None``; the statement is rejoined.
  """
  mdb = request.app['db']
  topn = getreqarg_topn(request)
  if not topn:
    topn = 10
  nka: int = getreqarg_nka(request)
  ltype: str = getreqarg_ltype(request)

  n_gramms = mdb.n_gramms
  topN = await _get_topn_ngramm(
    n_gramms, nka, ltype, topn, title_always_id=True, show_type=True)
  # Only ngrams from the top-N list are considered below.
  exists = frozenset(t for t, *_ in topN)

  out_list = []
  contexts = mdb.contexts
  pipeline = [
    {'$project': {
      'prefix': 0, 'suffix': 0, 'exact': 0, 'positive_negative': 0,
      'bundles': 0, 'linked_papers_topics': 0}},
    {'$lookup': {
      'from': 'n_gramms', 'localField': 'linked_papers_ngrams._id',
      'foreignField': '_id', 'as': 'cont'}},
    {'$unwind': '$cont'},
  ]
  if nka or ltype:
    pipeline += [get_ngramm_filter(nka, ltype, 'cont')]
  pipeline += [
    {'$unwind': '$linked_papers_ngrams'},
    # Keep only the lookup row matching the unwound linked ngram.
    {'$match': {'$expr': {'$eq': ['$linked_papers_ngrams._id', '$cont._id']}}},
  ]

  for i, (ngrmm, typ_, cnt, conts) in enumerate(topN, 1):
    congr = defaultdict(set)   # ngram id -> set of publication ids
    titles = {}
    types = {}
    work_pipeline = [
      {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': conts}}}
    ] + pipeline

    async for doc in contexts.aggregate(work_pipeline):
      cont = doc['cont']
      ngr_id = cont['_id']
      if ngr_id not in exists:
        continue
      congr[ngr_id].add(doc['pub_id'])
      titles[ngr_id] = cont['title']
      types[ngr_id] = cont['type']

    # The ngram itself must appear in its own contexts; otherwise skip.
    pubs = congr.pop(ngrmm, None)
    if not pubs:
      continue

    crossgrams = []
    oltype = ltype if ltype else types[ngrmm]
    # crossgrams is the same list object referenced by the dict below,
    # so appending afterwards populates the response entry in place.
    out_list.append(dict(
      title=titles[ngrmm], type=oltype, pubs=tuple(sorted(pubs)),
      crossgrams=crossgrams, cnt_pubs=len(pubs), cnt_cross=len(congr)))
    enum_sort = enumerate(
      sorted(congr.items(), key=lambda kv: (-len(kv[1]), kv[0])), 1)
    for j, (co, vals) in enum_sort:
      crossgrams.append(dict(
        title=titles[co], type=types[co], cnt=len(vals),
        pubs=tuple(sorted(vals))))

  return json_response(out_list)
async def _req_frags_ngramm(request: web.Request) -> web.StreamResponse:
  """Distribution "5 fragments" vs "phrases from citation contexts".

  Aggregates contexts by ngram and fragment number (1..5), joins ngram
  metadata, and returns per-phrase totals with a per-fragment breakdown.
  """
  mdb = request.app['db']
  topn = getreqarg_topn(request)
  if not topn:
    topn = 10
  nka: int = getreqarg_nka(request)
  ltype: str = getreqarg_ltype(request)

  pipeline = [
    {'$match': {
      'frag_num': {'$exists': 1},
      'linked_papers_ngrams': {'$exists': 1}}},
    {'$project': {
      '_id': 1, 'frag_num': 1, 'linked_paper': '$linked_papers_ngrams'}},
    {'$unwind': '$linked_paper'},
    # First sum per (ngram, fragment), then roll up per ngram keeping
    # the per-fragment counts in "frags".
    {'$group': {
      '_id': {'_id': '$linked_paper._id', 'frag_num': '$frag_num'},
      'count': {'$sum': '$linked_paper.cnt'},}},
    {'$group': {
      '_id': '$_id._id', 'count': {'$sum': '$count'},
      'frags': {'$push': {'frag_num': '$_id.frag_num', 'count': '$count',}},}},
    {'$sort': {'count': -1, '_id': 1}},
    {'$lookup': {
      'from': 'n_gramms', 'localField': '_id',
      'foreignField': '_id', 'as': 'ngramm'}},
    {'$unwind': '$ngramm'},
  ]
  if nka or ltype:
    pipeline.append(get_ngramm_filter(nka, ltype, 'ngramm'))
  pipeline.append(
    {'$project': {
      'title': '$ngramm.title', 'type': '$ngramm.type',
      'nka': '$ngramm.nka', 'count': '$count', 'frags': '$frags'}})
  if topn:
    pipeline.append({'$limit': topn})

  _logger.debug('pipeline: %s', pipeline)
  out_dict = {}
  async for doc in mdb.contexts.aggregate(pipeline):
    title = doc['title']
    # Zero-fill fragments 1..5, then overwrite with actual counts.
    frag_counts = dict.fromkeys(range(1, 6), 0)
    frag_counts.update(
      (f['frag_num'], f['count']) for f in doc['frags'])
    entry = dict(sum=doc['count'], frags=frag_counts)
    if not nka:
      entry.update(nka=doc['nka'])
    if ltype:
      # Type fixed by the request: key by title, omit the type field.
      out_dict[title] = entry
    else:
      entry.update(type=doc['type'], title=title)
      out_dict[doc['_id']] = entry
  return json_response(out_dict)
async def _req_frags_cocitauthors_ngramms(
    request: web.Request) -> web.StreamResponse:
  """Cross-distribution "co-citations" vs "phrases from citation contexts".

  For each top co-cited author: a per-fragment histogram of their contexts
  and the ngrams occurring in them (crossgrams), ranked by total count.
  """
  mdb = request.app['db']
  topn: int = getreqarg_topn(request)
  topn_gramm: int = getreqarg_int(request, 'topn_gramm')
  if not topn_gramm:
    topn_gramm = 500
  nka: int = getreqarg_nka(request)
  ltype: str = getreqarg_ltype(request)

  if topn_gramm:
    n_gramms = mdb.n_gramms
    top_ngramms = await _get_topn_ngramm(
      n_gramms, nka, ltype, topn_gramm, title_always_id=True)
    exists = frozenset(t for t, _, _ in top_ngramms)
  else:
    exists = ()

  contexts = mdb.contexts
  topN = await _get_topn_cocit_authors(contexts, topn, include_conts=True)

  pipeline = [
    {'$project': {
      'prefix': 0, 'suffix': 0, 'exact': 0, 'positive_negative': 0,
      'bundles': 0, 'linked_papers_topics': 0}},
    {'$lookup': {
      'from': 'n_gramms', 'localField': 'linked_papers_ngrams._id',
      'foreignField': '_id', 'as': 'cont'}},
    {'$unwind': '$cont'},
  ]
  if nka or ltype:
    pipeline += [get_ngramm_filter(nka, ltype, 'cont')]
  pipeline += [
    {'$unwind': '$linked_papers_ngrams'},
    # Keep only the lookup row matching the unwound linked ngram.
    {'$match': {'$expr': {'$eq': ['$cont._id', '$linked_papers_ngrams._id']}}},
  ]

  out_dict = {}
  for i, (cocitauthor, cnt, conts) in enumerate(topN, 1):
    frags = Counter()                # fragment number -> context count
    congr = defaultdict(Counter)    # ngram id -> fragment -> cnt
    titles = {}
    types = {}
    work_pipeline = [
      {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': conts}}},
    ] + pipeline

    async for doc in contexts.aggregate(work_pipeline):
      cont = doc['cont']
      ngr_id = cont['_id']
      if topn_gramm and ngr_id not in exists:
        continue
      fnum = doc['frag_num']
      frags[fnum] += 1
      congr[ngr_id][fnum] += doc['linked_papers_ngrams']['cnt']
      titles[ngr_id] = cont['title']
      types[ngr_id] = cont['type']

    # crossgrams is shared with the dict entry and filled in below.
    crossgrams = []
    out_dict[cocitauthor] = dict(sum=cnt, frags=frags, crossgrams=crossgrams)
    ranked = sorted(
      congr.items(), key=lambda kv: (-sum(kv[1].values()), kv[0]))
    for co, cnts in ranked:
      crossgrams.append(dict(
        title=titles[co], type=types[co], frags=cnts,
        sum=sum(cnts.values())))

  return json_response(out_dict)
async def _req_frags_topics_ngramms(request: web.Request) -> web.StreamResponse:
  """Cross-distribution "topics" vs "phrases".

  For each top topic: a per-fragment histogram of its contexts and the
  ngram titles occurring in them (crossgrams), ranked by total count and
  optionally truncated at topn_crpssgramm distinct ngrams.

  Fix: the assignment ``crossgrams[co] = dict(...)`` was split across lines
  with its right-hand side orphaned (a syntax error as written); the
  statement is rejoined.  Local typo ``zerro_frags`` renamed.
  """
  mdb = request.app['db']
  topn = getreqarg_topn(request)
  # NOTE: the request parameter name 'topn_crpssgramm' (sic) is part of
  # the public API, so the typo in the parameter name is preserved.
  topn_crpssgramm: int = getreqarg_int(request, 'topn_crpssgramm')
  topn_gramm: int = getreqarg_int(request, 'topn_gramm')
  if not topn_gramm:
    topn_gramm = 500
  nka = getreqarg_nka(request)
  ltype = getreqarg_ltype(request)

  if topn_gramm:
    n_gramms = mdb.n_gramms
    top_ngramms = await _get_topn_ngramm(
      n_gramms, nka, ltype, topn_gramm, title_always_id=True)
    exists = frozenset(t for t, _, _ in top_ngramms)
  else:
    exists = ()

  pipeline = [
    {'$project': {
      'prefix': 0, 'suffix': 0, 'exact': 0, 'positive_negative': 0,
      'bundles': 0, 'linked_papers_topics': 0}},
    {'$unwind': '$linked_papers_ngrams'},
    {'$lookup': {
      'from': 'n_gramms', 'localField': 'linked_papers_ngrams._id',
      'foreignField': '_id', 'as': 'ngrm'}},
    {'$unwind': '$ngrm'},
  ]
  if nka or ltype:
    pipeline += [get_ngramm_filter(nka, ltype, 'ngrm')]
  pipeline += [
    {'$sort': {'ngrm.count_in_linked_papers': -1, 'ngrm.count_all': -1}},
  ]

  top_topics = await _get_topn_topics(mdb.topics, topn)

  contexts = mdb.contexts
  out_dict = {}
  zero_frags = {n: 0 for n in range(1, 6)}
  for i, (topic, cnt, conts) in enumerate(top_topics, 1):
    frags = Counter(zero_frags)                     # fragment -> count
    congr = defaultdict(partial(Counter, zero_frags))  # title -> fragment -> count
    work_pipeline = [
      {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': conts}}},
    ] + pipeline

    async for doc in contexts.aggregate(work_pipeline):
      cont = doc['ngrm']
      ngr = cont['title']
      if exists and cont['_id'] not in exists:
        continue
      fnum = doc['frag_num']
      frags[fnum] += 1
      congr[ngr][fnum] += 1
      # Stop once the requested number of distinct crossgrams is reached.
      if topn_crpssgramm and len(congr) == topn_crpssgramm:
        break

    # crossgrams is shared with the dict entry and filled in below.
    crossgrams = {}
    out_dict[topic] = dict(sum=cnt, frags=frags, crossgrams=crossgrams)
    for j, (co, cnts) in enumerate(
        sorted(congr.items(), key=lambda kv: (-sum(kv[1].values()), kv[0])),
        1):
      crossgrams[co] = dict(frags=cnts, sum=sum(cnts.values()))

  return json_response(out_dict)