def print_top_ngramms_top_author_by_frags(
  mdb, topn:int, *, topn_authors:int=500, nka:int=2, ltype:str='lemmas'
):
  """Print the cross-distribution "phrases" x "co-citations".

  For each of the top-N n-gramms (filtered by *nka* and *ltype*), counts
  context hits per fragment number (1..5) and per co-cited author, then
  prints one line per phrase followed by ranked per-author breakdowns.

  NOTE(review): *topn_authors* is accepted but never used in this body —
  kept for interface compatibility; confirm against callers.
  """
  print('Кросс-распределение «фразы» - «со-цитирования»')
  ngramms_coll = mdb.n_gramms
  top_ngramms = get_topn(
    ngramms_coll, topn, preselect=[{'$match': {'nka': nka, 'type': ltype}}],
    sum_expr='$linked_papers.cnt')
  contexts = mdb.contexts
  for rank, (phrase, total, cont_ids) in enumerate(top_ngramms, 1):
    by_frag = Counter()
    by_author = defaultdict(Counter)
    pipeline = [
      {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': cont_ids}}},
      {'$project': {'prefix': False, 'suffix': False, 'exact': False}},
      {'$unwind': '$cocit_authors'},
    ]
    for doc in contexts.aggregate(pipeline):
      author = doc['cocit_authors']
      frag = doc['frag_num']
      by_frag[frag] += 1
      by_author[author][frag] += 1
    frag_msg = '/'.join(str(by_frag[f]) for f in range(1, 6))
    print(f"{rank:<3d} '{phrase}' {frag_msg} ({sum(by_frag.values())})")
    ranked = sorted(
      by_author.items(), key=lambda kv: (-sum(kv[1].values()), kv[0]))
    for sub_rank, (author, counts) in enumerate(ranked, 1):
      sub_msg = '/'.join(str(counts[f]) for f in range(1, 6))
      print(f" {sub_rank:<3d} '{author}': {sub_msg} ({sum(counts.values())})")
def print_topics_top_author_by_frags(mdb):
  """Dump the cross-distribution "topics" x "co-citations" to JSON.

  For each of the top 100 topics, counts context hits per fragment number
  and per co-cited author, then writes the nested structure to
  ``../out_json/topics_top_author_by_frags.json``.

  NOTE(review): a console-printing function with the same name exists in
  this module; whichever is defined later wins — confirm which one callers
  expect.
  """
  print(
    'Г Кросс-распределение «топики» - «со-цитирования»:',
    'topics_top_author_by_frags.json')
  top_topics = get_topn(mdb.topics, 100)
  contexts = mdb.contexts
  out_dict = {}
  for topic, cnt, conts in top_topics:
    frags = Counter()
    congr = defaultdict(Counter)
    for doc in contexts.aggregate([
      {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': conts}}},
      {'$project': {'prefix': False, 'suffix': False, 'exact': False}},
      {'$unwind': '$cocit_authors'},
    ]):
      ngr = doc['cocit_authors']
      fnum = doc['frag_num']
      frags[fnum] += 1
      congr[ngr][fnum] += 1
    crosscocitaith = {}
    out_dict[topic] = dict(sum=cnt, frags=frags, cocitaithors=crosscocitaith)
    # Rank authors by total hits (descending), ties broken alphabetically.
    for co, cnts in sorted(
      congr.items(), key=lambda kv: (-sum(kv[1].values()), kv[0])):
      crosscocitaith[co] = dict(frags=cnts, sum=sum(cnts.values()))
  # Fix: ensure_ascii=False emits raw Cyrillic, so the file must be opened
  # with an explicit UTF-8 encoding — the platform-default codec (e.g.
  # cp1251/cp1252 on Windows) could otherwise raise UnicodeEncodeError.
  with open(
    '../out_json/topics_top_author_by_frags.json', 'w', encoding='utf-8'
  ) as out:
    json.dump(out_dict, out, ensure_ascii=False)
def print_top_author_ngramms_by_frags(
  mdb, topn:int, *, topn_gramms:int=500, nka:int=2, ltype:str='lemmas'
):
  """Print the cross-distribution "co-citations" x "citation-context phrases".

  For each of the top-N co-cited authors, joins their citation contexts with
  the ``n_gramms`` collection (filtered by *nka* / *ltype*), counts hits per
  fragment number (1..5) and per phrase, and prints ranked breakdowns.
  When *topn_gramms* is non-zero, only phrases from the global top list are
  counted.
  """
  print('Кросс-распределение «со-цитирования» - «фразы из контекстов цитирований»')
  if topn_gramms:
    n_gramms = mdb.n_gramms
    top_ngramms = get_topn(
      n_gramms, topn_gramms,
      preselect=[{'$match': {'nka': nka, 'type': ltype}}],
      sum_expr='$linked_papers.cnt')
    exists = frozenset(t for t, _, _ in top_ngramms)
  else:
    # Fix: previously `exists` stayed unbound on this branch and only the
    # short-circuit in the filter below prevented a NameError; bind it
    # explicitly (mirrors the JSON variant of this function).
    exists = frozenset()
  contexts = mdb.contexts
  topN = get_topn_cocit_authors(contexts, topn, include_conts=True)
  for i, (cocitauthor, cnt, conts) in enumerate(topN, 1):
    frags = Counter()
    congr = defaultdict(Counter)
    for doc in contexts.aggregate([
      {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': conts}}},
      {'$project': {'prefix': False, 'suffix': False, 'exact': False}},
      {'$lookup': {
        'from': 'n_gramms', 'localField': '_id',
        'foreignField': 'linked_papers.cont_id', 'as': 'cont'}},
      {'$unwind': '$cont'},
      {'$match': {'cont.nka': nka, 'cont.type': ltype}},
      {'$unwind': '$cont.linked_papers'},
      # $lookup matches every linked paper; keep only the row whose
      # linked_papers entry refers back to this very context.
      {'$match': {'$expr': {'$eq': ["$_id", "$cont.linked_papers.cont_id"]}}},
      {'$project': {'cont.type': False}},
    ]):
      cont = doc['cont']
      ngr = cont['title']
      if topn_gramms and ngr not in exists:
        continue
      fnum = doc['frag_num']
      frags[fnum] += 1
      # Weight phrase counts by how often the phrase occurs in the context.
      congr[ngr][fnum] += cont['linked_papers']['cnt']
    msg = '/'.join(str(frags[f]) for f in range(1, 6))
    print(f"{i:<3d} '{cocitauthor}' {msg} ({cnt})")
    for j, (co, cnts) in enumerate(
      sorted(congr.items(), key=lambda kv: (-sum(kv[1].values()), kv[0])), 1):
      msg = '/'.join(str(cnts[f]) for f in range(1, 6))
      print(f" {j:<3d} '{co}': {msg} ({sum(cnts.values())})")
def print_topics_top_author_by_frags(mdb):
  """Print the cross-distribution "topics" x "co-citations".

  For each of the top 100 topics, counts context hits per fragment number
  (1..5) and per co-cited author, and prints ranked breakdowns.

  NOTE(review): a JSON-dumping function with the same name exists in this
  module; whichever is defined later wins — confirm which one callers expect.
  """
  print('Кросс-распределение «топики» - «со-цитирования»')
  top_topics = get_topn(mdb.topics, 100)
  contexts = mdb.contexts
  for rank, (topic, total, cont_ids) in enumerate(top_topics, 1):
    by_frag = Counter()
    by_author = defaultdict(Counter)
    pipeline = [
      {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': cont_ids}}},
      {'$project': {'prefix': False, 'suffix': False, 'exact': False}},
      {'$unwind': '$cocit_authors'},
    ]
    for doc in contexts.aggregate(pipeline):
      author = doc['cocit_authors']
      frag = doc['frag_num']
      by_frag[frag] += 1
      by_author[author][frag] += 1
    frag_msg = '/'.join(str(by_frag[f]) for f in range(1, 6))
    print(f"{rank:<3d} '{topic}' {frag_msg} ({sum(by_frag.values())})")
    ranked = sorted(
      by_author.items(), key=lambda kv: (-sum(kv[1].values()), kv[0]))
    for sub_rank, (author, counts) in enumerate(ranked, 1):
      sub_msg = '/'.join(str(counts[f]) for f in range(1, 6))
      print(f" {sub_rank:<3d} '{author}': {sub_msg} ({sum(counts.values())})")
def print_top_ngramms_topics_by_frags(
  mdb, topn:int=10, *, nka:int=2, ltype:str='lemmas'
):
  """Print the cross-distribution "phrases" x "citation-context topics".

  For each of the top-N n-gramms (filtered by *nka* / *ltype*), joins
  their contexts with the ``topics`` collection, counts hits per fragment
  number (1..5) and per topic, and prints ranked breakdowns.
  """
  print('Кросс-распределение «фразы» - «топики контекстов цитирований»')
  ngramms_coll = mdb.n_gramms
  top_ngramms = get_topn(
    ngramms_coll, topn, preselect=[{'$match': {'nka': nka, 'type': ltype}}],
    sum_expr='$linked_papers.cnt')
  contexts = mdb.contexts
  for rank, (phrase, total, cont_ids) in enumerate(top_ngramms, 1):
    by_frag = Counter()
    by_topic = defaultdict(Counter)
    pipeline = [
      {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': cont_ids}}},
      {'$project': {'prefix': False, 'suffix': False, 'exact': False}},
      {'$lookup': {
        'from': 'topics', 'localField': '_id',
        'foreignField': 'linked_papers.cont_id', 'as': 'cont'}},
      {'$unwind': '$cont'},
      {'$unwind': '$cont.linked_papers'},
      # Keep only the joined row whose linked_papers entry points back
      # at this very context.
      {'$match': {'$expr': {'$eq': ["$_id", "$cont.linked_papers.cont_id"]}}},
      {'$project': {'cont.type': False}},
    ]
    for doc in contexts.aggregate(pipeline):
      topic_title = doc['cont']['title']
      frag = doc['frag_num']
      by_frag[frag] += 1
      by_topic[topic_title][frag] += 1
    frag_msg = '/'.join(str(by_frag[f]) for f in range(1, 6))
    print(f"{rank:<2d} '{phrase}' {frag_msg} ({sum(by_frag.values())})")
    ranked = sorted(
      by_topic.items(), key=lambda kv: (-sum(kv[1].values()), kv[0]))
    for sub_rank, (topic_title, counts) in enumerate(ranked, 1):
      sub_msg = '/'.join(str(counts[f]) for f in range(1, 6))
      print(
        f" {sub_rank:<2d} '{topic_title}': {sub_msg} ({sum(counts.values())})")
def print_top_author_ngramms_by_frags(mdb, topn: int, *, topn_gramms: int = 500,
                                      nka: int = 2, ltype: str = 'lemmas'):
  """Dump the cross-distribution "co-citations" x "context phrases" to JSON.

  For each of the top-N co-cited authors, joins their citation contexts
  with ``n_gramms`` (filtered by *nka* / *ltype*), counts hits per fragment
  number and per phrase, and writes the nested structure to
  ``../out_json/top_author_ngramms_by_frags.json``.  When *topn_gramms* is
  non-zero, only phrases from the global top list are counted.

  NOTE(review): a console-printing function with the same name exists in
  this module; whichever is defined later wins — confirm which one callers
  expect.
  """
  print(
    'Б',
    'Кросс-распределение «со-цитирования» - «фразы из контекстов цитирований»:',
    'top_author_ngramms_by_frags.json')
  if topn_gramms:
    n_gramms = mdb.n_gramms
    top_ngramms = get_topn(
      n_gramms, topn_gramms,
      preselect=[{'$match': {'nka': nka, 'type': ltype}}],
      sum_expr='$linked_papers.cnt')
    exists = frozenset(t for t, _, _ in top_ngramms)
  else:
    exists = frozenset()
  contexts = mdb.contexts
  topN = get_topn_cocit_authors(contexts, topn, include_conts=True)
  out_dict = {}
  for cocitauthor, cnt, conts in topN:
    frags = Counter()
    congr = defaultdict(Counter)
    for doc in contexts.aggregate([
      {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': conts}}},
      {'$lookup': {
        'from': 'n_gramms', 'localField': '_id',
        'foreignField': 'linked_papers.cont_id', 'as': 'cont'}},
      {'$unwind': '$cont'},
      {'$match': {'cont.nka': nka, 'cont.type': ltype}},
      {'$unwind': '$cont.linked_papers'},
      # $lookup matches every linked paper; keep only the row whose
      # linked_papers entry refers back to this very context.
      {'$match': {'$expr': {'$eq': ["$_id", "$cont.linked_papers.cont_id"]}}},
      {'$project': {'cont.type': False}},
    ]):
      cont = doc['cont']
      ngr = cont['title']
      if topn_gramms and ngr not in exists:
        continue
      fnum = doc['frag_num']
      frags[fnum] += 1
      # Weight phrase counts by how often the phrase occurs in the context.
      congr[ngr][fnum] += cont['linked_papers']['cnt']
    crossgrams = {}
    out_dict[cocitauthor] = dict(sum=cnt, frags=frags, crossgrams=crossgrams)
    # Rank phrases by total weighted hits (descending), ties alphabetically.
    for co, cnts in sorted(
      congr.items(), key=lambda kv: (-sum(kv[1].values()), kv[0])):
      crossgrams[co] = dict(frags=cnts, sum=sum(cnts.values()))
  # Fix: ensure_ascii=False emits raw Cyrillic, so the file must be opened
  # with an explicit UTF-8 encoding rather than the platform default.
  with open(
    '../out_json/top_author_ngramms_by_frags.json', 'w', encoding='utf-8'
  ) as out:
    json.dump(out_dict, out, ensure_ascii=False)
def print_top_ngramms_topics_by_frags(mdb, topn: int = 10, *, nka: int = 2,
                                      ltype: str = 'lemmas'):
  """Dump the cross-distribution "phrases" x "context topics" to JSON.

  For each of the top-N n-gramms (filtered by *nka* / *ltype*), joins
  their contexts with the ``topics`` collection, counts hits per fragment
  number and per topic, and writes the nested structure to
  ``../out_json/top_ngramms_topics_by_frags.json``.

  NOTE(review): a console-printing function with the same name exists in
  this module; whichever is defined later wins — confirm which one callers
  expect.
  """
  print(
    'В', 'Кросс-распределение «фразы» - «топики контекстов цитирований»:',
    'top_ngramms_topics_by_frags.json')
  n_gramms = mdb.n_gramms
  top_ngramms = get_topn(
    n_gramms, topn, preselect=[{'$match': {'nka': nka, 'type': ltype}}],
    sum_expr='$linked_papers.cnt')
  contexts = mdb.contexts
  out_dict = {}
  for ngrmm, cnt, conts in top_ngramms:
    frags = Counter()
    congr = defaultdict(Counter)
    for doc in contexts.aggregate([
      {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': conts}}},
      {'$project': {'prefix': False, 'suffix': False, 'exact': False}},
      {'$lookup': {
        'from': 'topics', 'localField': '_id',
        'foreignField': 'linked_papers.cont_id', 'as': 'cont'}},
      {'$unwind': '$cont'},
      {'$unwind': '$cont.linked_papers'},
      # Keep only the joined row whose linked_papers entry points back
      # at this very context.
      {'$match': {'$expr': {'$eq': ["$_id", "$cont.linked_papers.cont_id"]}}},
      {'$project': {'cont.type': False}},
    ]):
      cont = doc['cont']
      ngr = cont['title']
      fnum = doc['frag_num']
      frags[fnum] += 1
      congr[ngr][fnum] += 1
    crosstopics = {}
    out_dict[ngrmm] = dict(sum=cnt, frags=frags, crosstopics=crosstopics)
    # Rank topics by total hits (descending), ties broken alphabetically.
    for co, cnts in sorted(
      congr.items(), key=lambda kv: (-sum(kv[1].values()), kv[0])):
      crosstopics[co] = dict(frags=cnts, sum=sum(cnts.values()))
  # Fix: ensure_ascii=False emits raw Cyrillic, so the file must be opened
  # with an explicit UTF-8 encoding rather than the platform default.
  with open(
    '../out_json/top_ngramms_topics_by_frags.json', 'w', encoding='utf-8'
  ) as out:
    json.dump(out_dict, out, ensure_ascii=False)
def print_top_ngramms_top_author_by_frags(mdb, topn: int, *,
                                          topn_authors: int = 500,
                                          nka: int = 2, ltype: str = 'lemmas'):
  """Dump the cross-distribution "phrases" x "co-citations" to JSON.

  For each of the top-N n-gramms (filtered by *nka* / *ltype*), counts
  context hits per fragment number and per co-cited author, then writes the
  nested structure to ``../out_json/top_ngramms_top_author_by_frags.json``.

  NOTE(review): *topn_authors* is accepted but never used in this body —
  kept for interface compatibility; confirm against callers.  A
  console-printing function with the same name also exists in this module;
  whichever is defined later wins.
  """
  print(
    'В', 'Кросс-распределение «фразы» - «со-цитирования»:',
    'top_ngramms_top_author_by_frags.json')
  n_gramms = mdb.n_gramms
  top_ngramms = get_topn(
    n_gramms, topn, preselect=[{'$match': {'nka': nka, 'type': ltype}}],
    sum_expr='$linked_papers.cnt')
  contexts = mdb.contexts
  out_dict = {}
  for ngramm, cnt, conts in top_ngramms:
    frags = Counter()
    congr = defaultdict(Counter)
    for doc in contexts.aggregate([
      {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': conts}}},
      {'$project': {'prefix': False, 'suffix': False, 'exact': False}},
      {'$unwind': '$cocit_authors'},
    ]):
      ngr = doc['cocit_authors']
      fnum = doc['frag_num']
      frags[fnum] += 1
      congr[ngr][fnum] += 1
    crosscocitaith = {}
    out_dict[ngramm] = dict(sum=cnt, frags=frags, cocitaithors=crosscocitaith)
    # Rank authors by total hits (descending), ties broken alphabetically.
    for co, cnts in sorted(
      congr.items(), key=lambda kv: (-sum(kv[1].values()), kv[0])):
      crosscocitaith[co] = dict(frags=cnts, sum=sum(cnts.values()))
  # Fix: ensure_ascii=False emits raw Cyrillic, so the file must be opened
  # with an explicit UTF-8 encoding rather than the platform default.
  with open(
    '../out_json/top_ngramms_top_author_by_frags.json', 'w', encoding='utf-8'
  ) as out:
    json.dump(out_dict, out, ensure_ascii=False)
def print_topics_top_ngramm_by_frags(mdb, topn_gramms: int = 10, nka: int = 4,
                                     ltype: str = 'lemmas'):
  """Print the cross-distribution "topics" x "phrases".

  For each of the top 100 topics, joins its citation contexts with
  ``n_gramms`` (filtered by *nka* / *ltype*, sorted by phrase frequency),
  tallies hits per fragment number (1..5) and per phrase, stopping once
  *topn_gramms* distinct phrases have been collected, and prints ranked
  breakdowns.
  """
  print('Кросс-распределение «топики» - «фразы»')
  top_topics = get_topn(mdb.topics, 100)
  contexts = mdb.contexts
  for rank, (topic, total, cont_ids) in enumerate(top_topics, 1):
    by_frag = Counter()
    by_phrase = defaultdict(Counter)
    pipeline = [
      {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': cont_ids}}},
      {'$project': {'prefix': False, 'suffix': False, 'exact': False}},
      {'$lookup': {
        'from': 'n_gramms', 'localField': '_id',
        'foreignField': 'linked_papers.cont_id', 'as': 'cont'}},
      {'$unwind': '$cont'},
      {'$match': {'cont.nka': nka, 'cont.type': ltype}},
      {'$project': {'cont.type': False}},
      # Most frequent phrases first, so the early break below keeps the top ones.
      {'$sort': {'cont.count_in_linked_papers': -1, 'cont.count_all': -1}},
    ]
    for doc in contexts.aggregate(pipeline):
      phrase = doc['cont']['title']
      frag = doc['frag_num']
      by_frag[frag] += 1
      by_phrase[phrase][frag] += 1
      # Stop harvesting once the requested number of distinct phrases is seen.
      if len(by_phrase) == topn_gramms:
        break
    frag_msg = '/'.join(str(by_frag[f]) for f in range(1, 6))
    print(f"{rank:<3d} '{topic}' {frag_msg} ({sum(by_frag.values())})")
    ranked = sorted(
      by_phrase.items(), key=lambda kv: (-sum(kv[1].values()), kv[0]))
    for sub_rank, (phrase, counts) in enumerate(ranked, 1):
      sub_msg = '/'.join(str(counts[f]) for f in range(1, 6))
      print(f" {sub_rank:<3d} '{phrase}': {sub_msg} ({sum(counts.values())})")