from collections import Counter, defaultdict
from functools import partial
from operator import itemgetter

from aiohttp import web
from pymongo.collection import Collection  # assumption: used for type hints only

# json_response, the getreqarg_* request-argument helpers, get_ngramm_filter
# and the _get_topn_* coroutines are defined elsewhere in this package.


async def _req_frags_ngramms_cocitauthors(
    request: web.Request
) -> web.StreamResponse:
    """Cross-distribution: "phrases" x "co-citations"."""
    app = request.app
    mdb = app['db']

    topn = getreqarg_topn(request)
    if not topn:
        topn = 10

    contexts = mdb.contexts
    topn_authors: int = getreqarg_int(request, 'topn_cocitauthors')
    if topn_authors:
        topNa = await _get_topn_cocit_authors(
            contexts, topn_authors, include_conts=False)
        exists = frozenset(t for t, _ in topNa)
    else:
        exists = frozenset()

    nka: int = getreqarg_int(request, 'nka')
    ltype: str = getreqarg(request, 'ltype')

    n_gramms = mdb.n_gramms
    top_ngramms = await _get_topn_ngramm(
        n_gramms, nka, ltype, topn, title_always_id=True, show_type=True)

    out_dict = []
    for ngramm, typ_, _, conts in top_ngramms:
        frags = Counter()
        congr = defaultdict(Counter)
        cnt = 0  # (context, co-cited author) hits that survive the filter

        async for doc in contexts.aggregate([
            {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': conts}}},
            {'$project': {'prefix': False, 'suffix': False, 'exact': False}},
            {'$unwind': '$cocit_authors'},
        ]):
            ngr = doc['cocit_authors']
            if topn_authors and ngr not in exists:
                continue
            cnt += 1
            fnum = doc['frag_num']
            frags[fnum] += 1
            congr[ngr][fnum] += 1

        if cnt:
            crosscocitauth = {}
            out_dict.append(dict(
                title=ngramm.split('_', 1)[-1], type=typ_, sum=cnt,
                frags=frags, cocitauthors=crosscocitauth))
            # Co-cited authors ordered by total hits (desc), then by name.
            for co, cnts in sorted(
                congr.items(), key=lambda kv: (-sum(kv[1].values()), kv[0])
            ):
                crosscocitauth[co] = dict(frags=cnts, sum=sum(cnts.values()))

    return json_response(out_dict)
async def _reg_cnt_ngramm(request: web.Request) -> web.StreamResponse:
    """Counts per n-gram: totals over linked contexts and publications."""
    app = request.app
    mdb = app['db']

    topn = getreqarg_topn(request)
    nka = getreqarg_int(request, 'nka')
    ltype = getreqarg(request, 'ltype')
    if nka or ltype:
        pipeline = [get_ngramm_filter(nka, ltype)]
    else:
        pipeline = []
    pipeline += [{'$sort': {'count_all': -1, 'title': 1, 'type': 1}}]
    if topn:
        pipeline += [{'$limit': topn}]

    out = []
    get_as_tuple = itemgetter('title', 'type', 'linked_papers')
    n_gramms = mdb.n_gramms
    async for doc in n_gramms.aggregate(pipeline):
        title, lt, conts = get_as_tuple(doc)
        res = dict(title=title) if ltype else dict(title=title, type=lt)
        cnt_all = cnt_cont = 0
        pubs = set()
        # Each linked_papers element is a {cont_id, cnt} mapping; the
        # publication id is the part of cont_id before the last '@'.
        for cid, cnt in (c.values() for c in conts):
            cnt_cont += 1
            cnt_all += cnt
            pubs.add(cid.rsplit('@', 1)[0])
        res.update(
            count_all=doc['count_all'], count=cnt_all, count_conts=cnt_cont,
            conts_pubs=len(pubs))
        out.append(res)

    return json_response(out)
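# `get_ngramm_filter` is referenced above but defined outside this section.
# A minimal sketch of the shape the pipelines here assume it returns -- a
# single $match stage keyed by n-gram length (nka) and/or type (ltype),
# optionally prefixed for a joined sub-document.  Field names are
# assumptions, not the authoritative implementation:
#
#     def get_ngramm_filter(nka: int, ltype: str, ngrm_field: str = None) -> dict:
#         prefix = f'{ngrm_field}.' if ngrm_field else ''
#         cond = {}
#         if nka:
#             cond[f'{prefix}nka'] = nka
#         if ltype:
#             cond[f'{prefix}type'] = ltype
#         return {'$match': cond}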
async def _req_frags_ngramms_topics(request: web.Request) -> web.StreamResponse:
    """Cross-distribution: "phrases" x "topics of citation contexts"."""
    app = request.app
    mdb = app['db']

    topn = getreqarg_topn(request)
    if not topn:
        topn = 10

    topn_topics: int = getreqarg_int(request, 'topn_topics')
    if topn_topics:
        topics = mdb.topics
        topNt = await _get_topn_topics(topics, topn=topn_topics)
        exists = frozenset(t for t, _, _ in topNt)
    else:
        exists = frozenset()

    nka: int = getreqarg_nka(request)
    ltype: str = getreqarg_ltype(request)

    n_gramms = mdb.n_gramms
    top_ngramms = await _get_topn_ngramm(
        n_gramms, nka, ltype, topn, title_always_id=True, show_type=True)

    contexts = mdb.contexts
    out_dict = []
    # Pre-seed all five fragment slots so the output always reports them.
    zero_frags = {n: 0 for n in range(1, 6)}
    for ngrmm, typ_, cnt, conts in top_ngramms:
        frags = Counter(zero_frags)
        congr = defaultdict(partial(Counter, zero_frags))

        pipeline = [
            {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': conts}}},
            {'$project': {
                'prefix': 0, 'suffix': 0, 'exact': 0, 'positive_negative': 0,
                'bundles': 0, 'linked_papers_ngrams': 0}},
            {'$unwind': '$linked_papers_topics'},
        ]

        async for doc in contexts.aggregate(pipeline):
            cont = doc['linked_papers_topics']
            topic = cont['_id']
            # Skip topics outside the requested top-N list.
            if topn_topics and topic not in exists:
                continue
            fnum = doc['frag_num']
            frags[fnum] += 1
            congr[topic][fnum] += 1

        crosstopics = {}
        out_dict.append(dict(
            title=ngrmm.split('_', 1)[-1], type=typ_, sum=cnt, frags=frags,
            crosstopics=crosstopics))
        for co, cnts in sorted(
            congr.items(), key=lambda kv: (-sum(kv[1].values()), kv[0])
        ):
            crosstopics[co] = dict(frags=cnts, sum=sum(cnts.values()))

    return json_response(out_dict)
async def _req_frags_cocitauthors_topics(request: web.Request) -> web.StreamResponse:
    """Cross-distribution: "co-citations" x "topics of citation contexts"."""
    app = request.app
    mdb = app['db']

    topn: int = getreqarg_topn(request)

    topn_topics: int = getreqarg_int(request, 'topn_topics')
    if topn_topics:
        topics = mdb.topics
        top_topics = await _get_topn_topics(topics, topn=topn_topics)
        exists = frozenset(t for t, _, _ in top_topics)
    else:
        exists = frozenset()

    contexts = mdb.contexts
    topN = await _get_topn_cocit_authors(contexts, topn, include_conts=True)

    # Shared tail of the aggregation: strip bulky fields, unwind per-context
    # topics and join them with the topics collection.
    pipeline = [
        {'$project': {
            'prefix': 0, 'suffix': 0, 'exact': 0, 'positive_negative': 0,
            'bundles': 0, 'linked_papers_ngrams': 0, 'cocit_authors': 0}},
        {'$unwind': '$linked_papers_topics'},
        {'$lookup': {
            'from': 'topics', 'localField': 'linked_papers_topics._id',
            'foreignField': '_id', 'as': 'cont'}},
        {'$unwind': '$cont'},
        {'$match': {'$expr': {'$eq': ['$cont._id', '$linked_papers_topics._id']}}},
    ]

    out_dict = {}
    for cocitauthor, cnt, conts in topN:
        frags = Counter()
        congr = defaultdict(Counter)

        work_pipeline = [
            {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': conts}}},
        ] + pipeline

        async for doc in contexts.aggregate(work_pipeline):
            cont = doc['cont']
            topic = cont['title']
            if topn_topics and topic not in exists:
                continue
            fnum = doc['frag_num']
            frags[fnum] += 1
            congr[topic][fnum] += 1

        crosstopics = {}
        out_dict[cocitauthor] = dict(sum=cnt, frags=frags, crosstopics=crosstopics)
        for co, cnts in sorted(
            congr.items(), key=lambda kv: (-sum(kv[1].values()), kv[0])
        ):
            crosstopics[co] = dict(frags=cnts, sum=sum(cnts.values()))

    return json_response(out_dict)
async def _req_frags_topics_cocitauthors(
    request: web.Request
) -> web.StreamResponse:
    """Cross-distribution: "topics" x "co-citations"."""
    app = request.app
    mdb = app['db']

    topn = getreqarg_topn(request)

    contexts = mdb.contexts
    topn_authors: int = getreqarg_int(request, 'topn_cocitauthors')
    if topn_authors:
        topNa = await _get_topn_cocit_authors(
            contexts, topn_authors, include_conts=False)
        exists = frozenset(t for t, _ in topNa)
    else:
        exists = frozenset()

    topics = mdb.topics
    topN = await _get_topn_topics(topics, topn=topn)

    out_dict = {}
    for topic, cnt, conts in topN:
        frags = Counter()
        congr = defaultdict(Counter)

        async for doc in contexts.aggregate([
            {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': conts}}},
            {'$project': {'prefix': False, 'suffix': False, 'exact': False}},
            {'$unwind': '$cocit_authors'},
        ]):
            ngr = doc['cocit_authors']
            if topn_authors and ngr not in exists:
                continue
            fnum = doc['frag_num']
            frags[fnum] += 1
            congr[ngr][fnum] += 1

        crosscocitauth = {}
        out_dict[topic] = dict(sum=cnt, frags=frags, cocitauthors=crosscocitauth)
        for co, cnts in sorted(
            congr.items(), key=lambda kv: (-sum(kv[1].values()), kv[0])
        ):
            crosscocitauth[co] = dict(frags=cnts, sum=sum(cnts.values()))

    return json_response(out_dict)
async def _req_top_ngramm(request: web.Request) -> web.StreamResponse:
    """Top N phrases."""
    app = request.app
    mdb = app['db']

    topn = getreqarg_topn(request)
    nka = getreqarg_int(request, 'nka')
    ltype = getreqarg(request, 'ltype')

    n_gramms = mdb.n_gramms
    topN = await _get_topn_ngramm(n_gramms, nka, ltype, topn)
    out = tuple(dict(title=n, contexts=conts) for n, _, conts in topN)
    return json_response(out)
async def _reg_cnt_pubs_ngramm(request: web.Request) -> web.StreamResponse:
    """Per-publication n-gram counts."""
    app = request.app
    mdb = app['db']

    topn = getreqarg_topn(request)
    nka = getreqarg_int(request, 'nka')
    ltype = getreqarg(request, 'ltype')
    if nka or ltype:
        pipeline = [get_ngramm_filter(nka, ltype)]
    else:
        pipeline = []

    n_gramms = mdb.n_gramms
    publications: Collection = mdb.publications
    out = {}
    get_as_tuple = itemgetter('title', 'type', 'linked_papers')

    async for pobj in publications.find({'name': {'$exists': True}}):
        pub_id = pobj['_id']
        # Keep only n-gramms that occur in contexts of this publication.
        pipeline_work = [
            {'$match': {'linked_papers.cont_id': {'$regex': f'^{pub_id}@'}}}
        ] + pipeline
        out_ngrs = []
        cont_starts = pub_id + '@'
        async for obj in n_gramms.aggregate(pipeline_work):
            title, lt, conts = get_as_tuple(obj)
            res = dict(title=title) if ltype else dict(title=title, type=lt)
            res.update(count_all=obj['count_all'])
            cnt_all = cnt_cont = 0
            for cid, cnt in (c.values() for c in conts):
                if cid.startswith(cont_starts):
                    cnt_cont += 1
                    cnt_all += cnt
            res.update(count=cnt_all, count_conts=cnt_cont)
            out_ngrs.append(res)

        out_ngrs = sorted(out_ngrs, key=itemgetter('count'), reverse=True)
        if topn:
            out_ngrs = out_ngrs[:topn]
        out[pub_id] = out_ngrs

    return json_response(out)
async def _req_top_ngramm_pubs(request: web.Request) -> web.StreamResponse:
    """Top N phrases by publication."""
    app = request.app
    mdb = app['db']

    topn = getreqarg_topn(request)
    nka = getreqarg_int(request, 'nka')
    ltype = getreqarg(request, 'ltype')
    if nka or ltype:
        pipeline = [get_ngramm_filter(nka, ltype)]
    else:
        pipeline = []
    pipeline += [
        {'$unwind': '$linked_papers'},
        {'$group': {
            '_id': '$title',
            'count': {'$sum': '$linked_papers.cnt'},
            'conts': {
                '$addToSet': {
                    'cont_id': '$linked_papers.cont_id',
                    'cnt': '$linked_papers.cnt'}}}},
        {'$sort': {'count': -1, '_id': 1}},
    ]
    if topn:
        pipeline += [{'$limit': topn}]

    n_gramms = mdb.n_gramms
    get_as_tuple = itemgetter('_id', 'count', 'conts')
    topN = [get_as_tuple(obj) async for obj in n_gramms.aggregate(pipeline)]

    get_pubs = itemgetter('cont_id', 'cnt')
    out = {}
    for name, cnt, conts in topN:
        # Roll context-level counts up to publications: the publication id
        # is the part of cont_id before the last '@'.
        pubs = Counter()
        for co in conts:
            cont_id, num = get_pubs(co)
            pubs[cont_id.rsplit('@', 1)[0]] += num
        out[name] = dict(all=cnt, contexts=pubs)

    return json_response(out)
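# A minimal usage sketch for `_req_top_ngramm_pubs` above, assuming it is
# wired to the hypothetical path '/top/ngramm/pubs' (the real route table is
# outside this section); names prefixed with "_example" are illustrative.
async def _example_top_ngramm_pubs(app: web.Application) -> dict:
    from aiohttp.test_utils import TestClient, TestServer

    # Spin up the app in-process and query the endpoint with typical params.
    async with TestClient(TestServer(app)) as client:
        resp = await client.get(
            '/top/ngramm/pubs', params={'topn': '5', 'nka': '2'})
        # Expected shape: {phrase: {'all': <int>, 'contexts': {pub_id: cnt}}}
        return await resp.json()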
async def _req_frags_cocitauthors_ngramms(request: web.Request) -> web.StreamResponse:
    """B: Cross-distribution "co-citations" x "phrases from citation contexts"."""
    app = request.app
    mdb = app['db']

    topn: int = getreqarg_topn(request)

    topn_gramm: int = getreqarg_int(request, 'topn_gramm')
    if not topn_gramm:
        topn_gramm = 500

    nka: int = getreqarg_nka(request)
    ltype: str = getreqarg_ltype(request)

    if topn_gramm:
        n_gramms = mdb.n_gramms
        top_ngramms = await _get_topn_ngramm(
            n_gramms, nka, ltype, topn_gramm, title_always_id=True)
        exists = frozenset(t for t, _, _ in top_ngramms)
    else:
        exists = frozenset()

    contexts = mdb.contexts
    topN = await _get_topn_cocit_authors(contexts, topn, include_conts=True)

    pipeline = [
        {'$project': {
            'prefix': 0, 'suffix': 0, 'exact': 0, 'positive_negative': 0,
            'bundles': 0, 'linked_papers_topics': 0}},
        {'$lookup': {
            'from': 'n_gramms', 'localField': 'linked_papers_ngrams._id',
            'foreignField': '_id', 'as': 'cont'}},
        {'$unwind': '$cont'},
    ]
    if nka or ltype:
        pipeline += [get_ngramm_filter(nka, ltype, 'cont')]
    pipeline += [
        {'$unwind': '$linked_papers_ngrams'},
        {'$match': {'$expr': {'$eq': ['$cont._id', '$linked_papers_ngrams._id']}}},
    ]

    out_dict = {}
    for cocitauthor, cnt, conts in topN:
        frags = Counter()
        congr = defaultdict(Counter)
        titles = {}
        types = {}

        work_pipeline = [
            {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': conts}}},
        ] + pipeline

        async for doc in contexts.aggregate(work_pipeline):
            cont = doc['cont']
            ngr_title = cont['title']
            ngr_id = cont['_id']
            if topn_gramm and ngr_id not in exists:
                continue
            fnum = doc['frag_num']
            frags[fnum] += 1
            # Weight the cross count by how often the n-gram occurs in the context.
            ngr_cnt = doc['linked_papers_ngrams']['cnt']
            congr[ngr_id][fnum] += ngr_cnt
            titles[ngr_id] = ngr_title
            types[ngr_id] = cont['type']

        crossgrams = []
        out_dict[cocitauthor] = dict(sum=cnt, frags=frags, crossgrams=crossgrams)
        for co, cnts in sorted(
            congr.items(), key=lambda kv: (-sum(kv[1].values()), kv[0])
        ):
            crossgrams.append(dict(
                title=titles[co], type=types[co], frags=cnts,
                sum=sum(cnts.values())))

    return json_response(out_dict)
async def _req_publ_cocitauthors_cocitauthors(
    request: web.Request
) -> web.StreamResponse:
    """A: Cross-distribution "publications" x "co-cited authors"."""
    app = request.app
    mdb = app['db']

    topn = getreqarg_topn(request)
    topn_cocitauthors: int = getreqarg_int(request, 'topn_cocitauthors')

    contexts = mdb.contexts
    topN = await _get_topn_cocit_authors(contexts, topn)
    exists = frozenset()
    if topn_cocitauthors:
        if not topn or topn >= topn_cocitauthors:
            # The top-N list already covers the requested co-author cut-off.
            exists = frozenset(map(itemgetter(0), topN[:topn_cocitauthors]))
        else:
            topNa = await _get_topn_cocit_authors(contexts, topn_cocitauthors)
            exists = frozenset(map(itemgetter(0), topNa))

    out_list = []
    for cocitauthor, _ in topN:
        cnt = 0
        pubs = set()
        coaut = defaultdict(set)

        async for doc in contexts.find(
            {'cocit_authors': cocitauthor, 'frag_num': {'$gt': 0}},
            projection=['pub_id', 'cocit_authors']
        ).sort('frag_num'):
            if exists:
                coauthors = frozenset(
                    c for c in doc['cocit_authors']
                    if c != cocitauthor and c in exists)
            else:
                coauthors = frozenset(
                    c for c in doc['cocit_authors'] if c != cocitauthor)
            if not coauthors:
                continue
            cnt += 1
            pub_id = doc['pub_id']
            pubs.add(pub_id)
            for ca in coauthors:
                coaut[ca].add(pub_id)

        if not coaut:
            continue

        out_cocitauthors = []
        out_list.append(dict(
            title=cocitauthor, cnt_pubs=len(pubs), cnt_cross=len(coaut),
            pubs=tuple(sorted(pubs)), cocitauthors=out_cocitauthors))
        # Co-authors ordered by number of shared publications (desc), then name.
        for co, vals in sorted(
            coaut.items(), key=lambda kv: (-len(kv[1]), kv[0])
        ):
            out_cocitauthors.append(
                dict(title=co, cnt_pubs=len(vals), pubs=tuple(sorted(vals))))

    return json_response(out_list)
async def _req_frags_topics_ngramms(request: web.Request) -> web.StreamResponse:
    """Cross-distribution: "topics" x "phrases"."""
    app = request.app
    mdb = app['db']

    topn = getreqarg_topn(request)
    topn_crossgramm: int = getreqarg_int(request, 'topn_crossgramm')
    topn_gramm: int = getreqarg_int(request, 'topn_gramm')
    if not topn_gramm:
        topn_gramm = 500

    nka = getreqarg_nka(request)
    ltype = getreqarg_ltype(request)

    if topn_gramm:
        n_gramms = mdb.n_gramms
        top_ngramms = await _get_topn_ngramm(
            n_gramms, nka, ltype, topn_gramm, title_always_id=True)
        exists = frozenset(t for t, _, _ in top_ngramms)
    else:
        exists = frozenset()

    pipeline = [
        {'$project': {
            'prefix': 0, 'suffix': 0, 'exact': 0, 'positive_negative': 0,
            'bundles': 0, 'linked_papers_topics': 0}},
        {'$unwind': '$linked_papers_ngrams'},
        {'$lookup': {
            'from': 'n_gramms', 'localField': 'linked_papers_ngrams._id',
            'foreignField': '_id', 'as': 'ngrm'}},
        {'$unwind': '$ngrm'},
    ]
    if nka or ltype:
        pipeline += [get_ngramm_filter(nka, ltype, 'ngrm')]
    pipeline += [
        {'$sort': {'ngrm.count_in_linked_papers': -1, 'ngrm.count_all': -1}},
    ]

    top_topics = await _get_topn_topics(mdb.topics, topn)

    contexts = mdb.contexts
    out_dict = {}
    # Pre-seed all five fragment slots so the output always reports them.
    zero_frags = {n: 0 for n in range(1, 6)}
    for topic, cnt, conts in top_topics:
        frags = Counter(zero_frags)
        congr = defaultdict(partial(Counter, zero_frags))

        work_pipeline = [
            {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': conts}}},
        ] + pipeline

        async for doc in contexts.aggregate(work_pipeline):
            cont = doc['ngrm']
            ngr = cont['title']
            if exists and cont['_id'] not in exists:
                continue
            fnum = doc['frag_num']
            frags[fnum] += 1
            congr[ngr][fnum] += 1
            # Stop once the requested number of cross phrases is collected.
            if topn_crossgramm and len(congr) == topn_crossgramm:
                break

        crossgrams = {}
        out_dict[topic] = dict(sum=cnt, frags=frags, crossgrams=crossgrams)
        for co, cnts in sorted(
            congr.items(), key=lambda kv: (-sum(kv[1].values()), kv[0])
        ):
            crossgrams[co] = dict(frags=cnts, sum=sum(cnts.values()))

    return json_response(out_dict)
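# A minimal wiring sketch, assuming an aiohttp application whose 'db' key
# holds the motor database used by the handlers above.  The URL paths are
# hypothetical; the actual route table is defined elsewhere in the package.
def _example_setup_routes(app: web.Application) -> None:
    add_get = app.router.add_get
    add_get('/frags/ngramms/cocitauthors', _req_frags_ngramms_cocitauthors)
    add_get('/frags/ngramms/topics', _req_frags_ngramms_topics)
    add_get('/frags/cocitauthors/topics', _req_frags_cocitauthors_topics)
    add_get('/frags/cocitauthors/ngramms', _req_frags_cocitauthors_ngramms)
    add_get('/frags/topics/cocitauthors', _req_frags_topics_cocitauthors)
    add_get('/frags/topics/ngramms', _req_frags_topics_ngramms)
    add_get('/top/ngramm', _req_top_ngramm)
    add_get('/top/ngramm/pubs', _req_top_ngramm_pubs)
    add_get('/publ/cocitauthors/cocitauthors', _req_publ_cocitauthors_cocitauthors)
    add_get('/cnt/ngramm', _reg_cnt_ngramm)
    add_get('/cnt/pubs/ngramm', _reg_cnt_pubs_ngramm)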