コード例 #1
0
def print_top_ngramms_top_author_by_frags(
  mdb, topn:int, *, topn_authors:int=500, nka:int=2, ltype:str='lemmas'
):
  """Print the cross-distribution of phrases (n-grams) over co-cited authors.

  For every item produced by ``get_topn`` over ``mdb.n_gramms`` the linked
  citation contexts are aggregated. A header line shows how many unwound
  (context, co-cited author) rows fall into fragments 1..5, and one indented
  line per co-cited author repeats that breakdown for the author.

  Args:
    mdb: database handle; its ``n_gramms`` and ``contexts`` collections are read.
    topn: forwarded to ``get_topn`` (presumably the number of n-grams to take).
    topn_authors: accepted but not used by this function.
    nka: value the ``nka`` field of an n-gram document must equal.
    ltype: value the ``type`` field of an n-gram document must equal.
  """
  print('Кросс-распределение «фразы» - «со-цитирования»')

  ngramm_filter = [{'$match': {'nka': nka, 'type': ltype}}]
  top_ngramms = get_topn(
    mdb.n_gramms, topn, preselect=ngramm_filter, sum_expr='$linked_papers.cnt')

  contexts = mdb.contexts
  shown_frags = range(1, 6)  # only fragments 1..5 appear in the breakdown

  def rank_key(item):
    # Largest total first, ties broken by the author value itself.
    author, per_frag = item
    return -sum(per_frag.values()), author

  for pos, (ngramm, _cnt, conts) in enumerate(top_ngramms, 1):
    pipeline = [
      {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': conts}}},
      {'$project': {'prefix': False, 'suffix': False, 'exact': False}},
      {'$unwind': '$cocit_authors'},
    ]

    frag_totals = Counter()
    by_author = defaultdict(Counter)
    for row in contexts.aggregate(pipeline):
      author = row['cocit_authors']
      frag_no = row['frag_num']
      frag_totals[frag_no] += 1
      by_author[author][frag_no] += 1

    breakdown = "/".join(str(frag_totals[n]) for n in shown_frags)
    total = sum(frag_totals.values())
    print(f"{pos:<3d} '{ngramm}' {breakdown} ({total})")

    ranked = sorted(by_author.items(), key=rank_key)
    for rank, (author, per_frag) in enumerate(ranked, 1):
      breakdown = "/".join(str(per_frag[n]) for n in shown_frags)
      print(f"   {rank:<3d} '{author}': {breakdown} ({sum(per_frag.values())})")
コード例 #2
0
ファイル: select4reportDjson.py プロジェクト: tonal/cirtec_db
def print_topics_top_author_by_frags(mdb):
    """Dump the cross-distribution of topics over co-cited authors to JSON.

    For every item produced by ``get_topn(mdb.topics, 100)`` the linked
    citation contexts are aggregated per fragment number and per co-cited
    author. The result is written to
    ``../out_json/topics_top_author_by_frags.json`` as a mapping
    ``topic -> {sum, frags, cocitaithors}``, where ``cocitaithors`` maps each
    author to ``{frags, sum}`` and is ordered by descending total, then author.

    Args:
        mdb: database handle; its ``topics`` and ``contexts`` collections
            are read.
    """
    print('Г Кросс-распределение «топики» - «со-цитирования»:',
          'topics_top_author_by_frags.json')

    top_topics = get_topn(mdb.topics, 100)

    contexts = mdb.contexts
    out_dict = {}
    for topic, cnt, conts in top_topics:
        frags = Counter()
        congr = defaultdict(Counter)

        pipeline = [
            {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': conts}}},
            {'$project': {'prefix': False, 'suffix': False, 'exact': False}},
            {'$unwind': '$cocit_authors'},
        ]
        for doc in contexts.aggregate(pipeline):
            author = doc['cocit_authors']
            fnum = doc['frag_num']
            frags[fnum] += 1
            congr[author][fnum] += 1

        # Insertion order of the dict is the ranking: biggest total first,
        # ties broken by the author value.
        ranked = sorted(congr.items(),
                        key=lambda kv: (-sum(kv[1].values()), kv[0]))
        crosscocitaith = {
            co: dict(frags=cnts, sum=sum(cnts.values()))
            for co, cnts in ranked
        }
        out_dict[topic] = dict(sum=cnt,
                               frags=frags,
                               cocitaithors=crosscocitaith)

    # ensure_ascii=False emits non-ASCII text verbatim, so the file encoding
    # is pinned to UTF-8 instead of relying on the platform default.
    with open('../out_json/topics_top_author_by_frags.json', 'w',
              encoding='utf-8') as out:
        json.dump(out_dict, out, ensure_ascii=False)
コード例 #3
0
def print_top_author_ngramms_by_frags(
  mdb, topn:int, *, topn_gramms:int=500, nka:int=2, ltype:str='lemmas'
):
  """Print the cross-distribution of co-cited authors over context phrases.

  For every item produced by ``get_topn_cocit_authors`` the linked citation
  contexts are joined with the ``n_gramms`` collection. A header line shows
  the number of joined rows for fragments 1..5 together with the count
  reported for the author, and one indented line per n-gram title shows the
  summed ``linked_papers.cnt`` values per fragment.

  Args:
    mdb: database handle; its ``n_gramms`` and ``contexts`` collections are read.
    topn: forwarded to ``get_topn_cocit_authors``.
    topn_gramms: when truthy, only n-gram titles returned by ``get_topn`` for
      this size are counted; when falsy, no title filtering is applied.
    nka: value the ``nka`` field of an n-gram document must equal.
    ltype: value the ``type`` field of an n-gram document must equal.
  """
  print('Кросс-распределение «со-цитирования» - «фразы из контекстов цитирований»')

  allowed_titles = None  # None means "do not filter by title"
  if topn_gramms:
    ngramm_filter = [{'$match': {'nka': nka, 'type': ltype}}]
    top_ngramms = get_topn(
      mdb.n_gramms, topn_gramms, preselect=ngramm_filter,
      sum_expr='$linked_papers.cnt')
    allowed_titles = frozenset(title for title, _, _ in top_ngramms)

  contexts = mdb.contexts
  top_authors = get_topn_cocit_authors(contexts, topn, include_conts=True)
  shown_frags = range(1, 6)  # only fragments 1..5 appear in the breakdown

  def rank_key(item):
    # Largest total first, ties broken by the n-gram title.
    title, per_frag = item
    return -sum(per_frag.values()), title

  for pos, (cocitauthor, cnt, conts) in enumerate(top_authors, 1):
    pipeline = [
      {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': conts}}},
      {'$project': {'prefix': False, 'suffix': False, 'exact': False}},
      {'$lookup': {
        'from': 'n_gramms', 'localField': '_id',
        'foreignField': 'linked_papers.cont_id', 'as': 'cont'}},
      {'$unwind': '$cont'},
      {'$match': {'cont.nka': nka, 'cont.type': ltype}},
      {'$unwind': '$cont.linked_papers'},
      # Keep only the linked_papers entry that refers to this very context.
      {'$match': {'$expr': {'$eq': ["$_id", "$cont.linked_papers.cont_id"]}}},
      {'$project': {'cont.type': False}},
    ]

    frag_totals = Counter()
    by_title = defaultdict(Counter)
    for row in contexts.aggregate(pipeline):
      linked = row['cont']
      title = linked['title']
      if allowed_titles is not None and title not in allowed_titles:
        continue

      frag_no = row['frag_num']
      frag_totals[frag_no] += 1
      by_title[title][frag_no] += linked['linked_papers']['cnt']

    breakdown = "/".join(str(frag_totals[n]) for n in shown_frags)
    print(f"{pos:<3d} '{cocitauthor}' {breakdown} ({cnt})")

    ranked = sorted(by_title.items(), key=rank_key)
    for rank, (title, per_frag) in enumerate(ranked, 1):
      breakdown = "/".join(str(per_frag[n]) for n in shown_frags)
      print(f"   {rank:<3d} '{title}': {breakdown} ({sum(per_frag.values())})")
コード例 #4
0
def print_topics_top_author_by_frags(mdb):
    """Print the cross-distribution of topics over co-cited authors.

    For every item produced by ``get_topn(mdb.topics, 100)`` the linked
    citation contexts are aggregated. A header line shows how many unwound
    (context, co-cited author) rows fall into fragments 1..5, and one
    indented line per co-cited author repeats that breakdown for the author.

    Args:
        mdb: database handle; its ``topics`` and ``contexts`` collections
            are read.
    """
    print('Кросс-распределение «топики» - «со-цитирования»')

    top_topics = get_topn(mdb.topics, 100)
    contexts = mdb.contexts
    shown_frags = range(1, 6)  # only fragments 1..5 appear in the breakdown

    def rank_key(item):
        # Largest total first, ties broken by the author value itself.
        author, per_frag = item
        return -sum(per_frag.values()), author

    for pos, (topic, _cnt, conts) in enumerate(top_topics, 1):
        pipeline = [
            {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': conts}}},
            {'$project': {'prefix': False, 'suffix': False, 'exact': False}},
            {'$unwind': '$cocit_authors'},
        ]

        frag_totals = Counter()
        by_author = defaultdict(Counter)
        for row in contexts.aggregate(pipeline):
            author = row['cocit_authors']
            frag_no = row['frag_num']
            frag_totals[frag_no] += 1
            by_author[author][frag_no] += 1

        breakdown = "/".join(str(frag_totals[n]) for n in shown_frags)
        total = sum(frag_totals.values())
        print(f"{pos:<3d} '{topic}' {breakdown} ({total})")

        ranked = sorted(by_author.items(), key=rank_key)
        for rank, (author, per_frag) in enumerate(ranked, 1):
            breakdown = "/".join(str(per_frag[n]) for n in shown_frags)
            author_total = sum(per_frag.values())
            print(f"   {rank:<3d} '{author}': {breakdown} ({author_total})")
コード例 #5
0
def print_top_ngramms_topics_by_frags(
  mdb, topn:int=10, *, nka:int=2, ltype:str='lemmas'
):
  """Print the cross-distribution of phrases (n-grams) over context topics.

  For every item produced by ``get_topn`` over ``mdb.n_gramms`` the linked
  citation contexts are joined with the ``topics`` collection. A header line
  shows the number of joined rows for fragments 1..5, and one indented line
  per topic title repeats that breakdown for the topic.

  Args:
    mdb: database handle; its ``n_gramms`` and ``contexts`` collections are read.
    topn: forwarded to ``get_topn`` (presumably the number of n-grams to take).
    nka: value the ``nka`` field of an n-gram document must equal.
    ltype: value the ``type`` field of an n-gram document must equal.
  """
  print('Кросс-распределение «фразы» - «топики контекстов цитирований»')

  ngramm_filter = [{'$match': {'nka': nka, 'type': ltype}}]
  top_ngramms = get_topn(
    mdb.n_gramms, topn, preselect=ngramm_filter, sum_expr='$linked_papers.cnt')

  contexts = mdb.contexts
  shown_frags = range(1, 6)  # only fragments 1..5 appear in the breakdown

  def rank_key(item):
    # Largest total first, ties broken by the topic title.
    title, per_frag = item
    return -sum(per_frag.values()), title

  for pos, (ngrmm, _cnt, conts) in enumerate(top_ngramms, 1):
    pipeline = [
      {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': conts}}},
      {'$project': {'prefix': False, 'suffix': False, 'exact': False}},
      {'$lookup': {
        'from': 'topics', 'localField': '_id',
        'foreignField': 'linked_papers.cont_id', 'as': 'cont'}},
      {'$unwind': '$cont'},
      {'$unwind': '$cont.linked_papers'},
      # Keep only the linked_papers entry that refers to this very context.
      {'$match': {'$expr': {'$eq': ["$_id", "$cont.linked_papers.cont_id"]}}},
      {'$project': {'cont.type': False}},
    ]

    frag_totals = Counter()
    by_topic = defaultdict(Counter)
    for row in contexts.aggregate(pipeline):
      title = row['cont']['title']
      frag_no = row['frag_num']
      frag_totals[frag_no] += 1
      by_topic[title][frag_no] += 1

    breakdown = "/".join(str(frag_totals[n]) for n in shown_frags)
    print(f"{pos:<2d} '{ngrmm}' {breakdown} ({sum(frag_totals.values())})")

    ranked = sorted(by_topic.items(), key=rank_key)
    for rank, (title, per_frag) in enumerate(ranked, 1):
      breakdown = "/".join(str(per_frag[n]) for n in shown_frags)
      print(f"   {rank:<2d} '{title}': {breakdown} ({sum(per_frag.values())})")
コード例 #6
0
ファイル: select4reportBjson.py プロジェクト: tonal/cirtec_db
def print_top_author_ngramms_by_frags(mdb,
                                      topn: int,
                                      *,
                                      topn_gramms: int = 500,
                                      nka: int = 2,
                                      ltype: str = 'lemmas'):
    """Dump the cross-distribution of co-cited authors over phrases to JSON.

    For every item produced by ``get_topn_cocit_authors`` the linked citation
    contexts are joined with the ``n_gramms`` collection. The result is
    written to ``../out_json/top_author_ngramms_by_frags.json`` as a mapping
    ``author -> {sum, frags, crossgrams}``, where ``crossgrams`` maps each
    n-gram title to ``{frags, sum}`` (summed ``linked_papers.cnt`` values)
    and is ordered by descending total, then title.

    Args:
        mdb: database handle; its ``n_gramms`` and ``contexts`` collections
            are read.
        topn: forwarded to ``get_topn_cocit_authors``.
        topn_gramms: when truthy, only n-gram titles returned by ``get_topn``
            for this size are counted; when falsy, no title filtering is
            applied.
        nka: value the ``nka`` field of an n-gram document must equal.
        ltype: value the ``type`` field of an n-gram document must equal.
    """
    print(
        'Б',
        'Кросс-распределение «со-цитирования» - «фразы из контекстов цитирований»:',
        'top_author_ngramms_by_frags.json')

    if topn_gramms:
        top_ngramms = get_topn(mdb.n_gramms,
                               topn_gramms,
                               preselect=[{
                                   '$match': {
                                       'nka': nka,
                                       'type': ltype
                                   }
                               }],
                               sum_expr='$linked_papers.cnt')
        exists = frozenset(t for t, _, _ in top_ngramms)
    else:
        exists = ()

    contexts = mdb.contexts
    topN = get_topn_cocit_authors(contexts, topn, include_conts=True)

    out_dict = {}
    for cocitauthor, cnt, conts in topN:
        frags = Counter()
        congr = defaultdict(Counter)

        pipeline = [
            {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': conts}}},
            {'$project': {'prefix': False, 'suffix': False, 'exact': False}},
            {'$lookup': {
                'from': 'n_gramms',
                'localField': '_id',
                'foreignField': 'linked_papers.cont_id',
                'as': 'cont'}},
            {'$unwind': '$cont'},
            {'$match': {'cont.nka': nka, 'cont.type': ltype}},
            {'$unwind': '$cont.linked_papers'},
            # Keep only the linked_papers entry that refers to this context.
            {'$match': {
                '$expr': {'$eq': ["$_id", "$cont.linked_papers.cont_id"]}}},
            {'$project': {'cont.type': False}},
        ]
        for doc in contexts.aggregate(pipeline):
            cont = doc['cont']
            ngr = cont['title']
            if topn_gramms and ngr not in exists:
                continue

            fnum = doc['frag_num']
            frags[fnum] += 1
            congr[ngr][fnum] += cont['linked_papers']['cnt']

        # Insertion order of the dict is the ranking: biggest total first,
        # ties broken by the n-gram title.
        ranked = sorted(congr.items(),
                        key=lambda kv: (-sum(kv[1].values()), kv[0]))
        crossgrams = {
            co: dict(frags=cnts, sum=sum(cnts.values()))
            for co, cnts in ranked
        }
        out_dict[cocitauthor] = dict(sum=cnt,
                                     frags=frags,
                                     crossgrams=crossgrams)

    # ensure_ascii=False emits non-ASCII text verbatim, so the file encoding
    # is pinned to UTF-8 instead of relying on the platform default.
    with open('../out_json/top_author_ngramms_by_frags.json', 'w',
              encoding='utf-8') as out:
        json.dump(out_dict, out, ensure_ascii=False)
コード例 #7
0
def print_top_ngramms_topics_by_frags(mdb,
                                      topn: int = 10,
                                      *,
                                      nka: int = 2,
                                      ltype: str = 'lemmas'):
    """Dump the cross-distribution of phrases over context topics to JSON.

    For every item produced by ``get_topn`` over ``mdb.n_gramms`` the linked
    citation contexts are joined with the ``topics`` collection. The result
    is written to ``../out_json/top_ngramms_topics_by_frags.json`` as a
    mapping ``n-gram -> {sum, frags, crosstopics}``, where ``crosstopics``
    maps each topic title to ``{frags, sum}`` and is ordered by descending
    total, then title.

    Args:
        mdb: database handle; its ``n_gramms`` and ``contexts`` collections
            are read.
        topn: forwarded to ``get_topn`` (presumably the number of n-grams
            to take).
        nka: value the ``nka`` field of an n-gram document must equal.
        ltype: value the ``type`` field of an n-gram document must equal.
    """
    print('В',
          'Кросс-распределение «фразы» - «топики контекстов цитирований»:',
          'top_ngramms_topics_by_frags.json')

    top_ngramms = get_topn(mdb.n_gramms,
                           topn,
                           preselect=[{
                               '$match': {
                                   'nka': nka,
                                   'type': ltype
                               }
                           }],
                           sum_expr='$linked_papers.cnt')

    contexts = mdb.contexts

    out_dict = {}
    for ngrmm, cnt, conts in top_ngramms:
        frags = Counter()
        congr = defaultdict(Counter)

        pipeline = [
            {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': conts}}},
            {'$project': {'prefix': False, 'suffix': False, 'exact': False}},
            {'$lookup': {
                'from': 'topics',
                'localField': '_id',
                'foreignField': 'linked_papers.cont_id',
                'as': 'cont'}},
            {'$unwind': '$cont'},
            {'$unwind': '$cont.linked_papers'},
            # Keep only the linked_papers entry that refers to this context.
            {'$match': {
                '$expr': {'$eq': ["$_id", "$cont.linked_papers.cont_id"]}}},
            {'$project': {'cont.type': False}},
        ]
        for doc in contexts.aggregate(pipeline):
            title = doc['cont']['title']
            fnum = doc['frag_num']
            frags[fnum] += 1
            congr[title][fnum] += 1

        # Insertion order of the dict is the ranking: biggest total first,
        # ties broken by the topic title.
        ranked = sorted(congr.items(),
                        key=lambda kv: (-sum(kv[1].values()), kv[0]))
        crosstopics = {
            co: dict(frags=cnts, sum=sum(cnts.values()))
            for co, cnts in ranked
        }
        out_dict[ngrmm] = dict(sum=cnt, frags=frags, crosstopics=crosstopics)

    # ensure_ascii=False emits non-ASCII text verbatim, so the file encoding
    # is pinned to UTF-8 instead of relying on the platform default.
    with open('../out_json/top_ngramms_topics_by_frags.json', 'w',
              encoding='utf-8') as out:
        json.dump(out_dict, out, ensure_ascii=False)
コード例 #8
0
def print_top_ngramms_top_author_by_frags(mdb,
                                          topn: int,
                                          *,
                                          topn_authors: int = 500,
                                          nka: int = 2,
                                          ltype: str = 'lemmas'):
    """Dump the cross-distribution of phrases over co-cited authors to JSON.

    For every item produced by ``get_topn`` over ``mdb.n_gramms`` the linked
    citation contexts are aggregated per fragment number and per co-cited
    author. The result is written to
    ``../out_json/top_ngramms_top_author_by_frags.json`` as a mapping
    ``n-gram -> {sum, frags, cocitaithors}``, where ``cocitaithors`` maps each
    author to ``{frags, sum}`` and is ordered by descending total, then author.

    Args:
        mdb: database handle; its ``n_gramms`` and ``contexts`` collections
            are read.
        topn: forwarded to ``get_topn`` (presumably the number of n-grams
            to take).
        topn_authors: accepted but not used by this function.
        nka: value the ``nka`` field of an n-gram document must equal.
        ltype: value the ``type`` field of an n-gram document must equal.
    """
    print('В', 'Кросс-распределение «фразы» - «со-цитирования»:',
          'top_ngramms_top_author_by_frags.json')

    top_ngramms = get_topn(mdb.n_gramms,
                           topn,
                           preselect=[{
                               '$match': {
                                   'nka': nka,
                                   'type': ltype
                               }
                           }],
                           sum_expr='$linked_papers.cnt')

    contexts = mdb.contexts

    out_dict = {}
    for ngramm, cnt, conts in top_ngramms:
        frags = Counter()
        congr = defaultdict(Counter)

        pipeline = [
            {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': conts}}},
            {'$project': {'prefix': False, 'suffix': False, 'exact': False}},
            {'$unwind': '$cocit_authors'},
        ]
        for doc in contexts.aggregate(pipeline):
            author = doc['cocit_authors']
            fnum = doc['frag_num']
            frags[fnum] += 1
            congr[author][fnum] += 1

        # Insertion order of the dict is the ranking: biggest total first,
        # ties broken by the author value.
        ranked = sorted(congr.items(),
                        key=lambda kv: (-sum(kv[1].values()), kv[0]))
        crosscocitaith = {
            co: dict(frags=cnts, sum=sum(cnts.values()))
            for co, cnts in ranked
        }
        out_dict[ngramm] = dict(sum=cnt,
                                frags=frags,
                                cocitaithors=crosscocitaith)

    # ensure_ascii=False emits non-ASCII text verbatim, so the file encoding
    # is pinned to UTF-8 instead of relying on the platform default.
    with open('../out_json/top_ngramms_top_author_by_frags.json', 'w',
              encoding='utf-8') as out:
        json.dump(out_dict, out, ensure_ascii=False)
コード例 #9
0
def print_topics_top_ngramm_by_frags(mdb,
                                     topn_gramms: int = 10,
                                     nka: int = 4,
                                     ltype: str = 'lemmas'):
    """Print the cross-distribution of topics over phrases (n-grams).

    For every item produced by ``get_topn(mdb.topics, 100)`` the linked
    citation contexts are joined with the ``n_gramms`` collection and read in
    the order given by the ``$sort`` stage. A header line shows the number of
    rows read for fragments 1..5, and one indented line per n-gram title
    repeats that breakdown for the title.

    Args:
        mdb: database handle; its ``topics`` and ``contexts`` collections
            are read.
        topn_gramms: reading of a topic's rows stops once this many distinct
            n-gram titles have been collected.
        nka: value the ``nka`` field of a joined n-gram document must equal.
        ltype: value the ``type`` field of a joined n-gram document must
            equal.
    """
    print('Кросс-распределение «топики» - «фразы»')

    top_topics = get_topn(mdb.topics, 100)
    contexts = mdb.contexts
    shown_frags = range(1, 6)  # only fragments 1..5 appear in the breakdown

    def rank_key(item):
        # Largest total first, ties broken by the n-gram title.
        title, per_frag = item
        return -sum(per_frag.values()), title

    for pos, (topic, _cnt, conts) in enumerate(top_topics, 1):
        pipeline = [
            {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': conts}}},
            {'$project': {'prefix': False, 'suffix': False, 'exact': False}},
            {'$lookup': {
                'from': 'n_gramms',
                'localField': '_id',
                'foreignField': 'linked_papers.cont_id',
                'as': 'cont'}},
            {'$unwind': '$cont'},
            {'$match': {'cont.nka': nka, 'cont.type': ltype}},
            {'$project': {'cont.type': False}},
            {'$sort': {
                'cont.count_in_linked_papers': -1,
                'cont.count_all': -1}},
        ]

        frag_totals = Counter()
        by_title = defaultdict(Counter)
        for row in contexts.aggregate(pipeline):
            title = row['cont']['title']
            frag_no = row['frag_num']
            frag_totals[frag_no] += 1
            by_title[title][frag_no] += 1
            # NOTE(review): the loop stops on the row that introduces the
            # topn_gramms-th distinct title, so any later rows (including
            # ones for already collected titles) are not counted -- confirm
            # this is the intended cut-off.
            if len(by_title) == topn_gramms:
                break

        breakdown = "/".join(str(frag_totals[n]) for n in shown_frags)
        total = sum(frag_totals.values())
        print(f"{pos:<3d} '{topic}' {breakdown} ({total})")

        ranked = sorted(by_title.items(), key=rank_key)
        for rank, (title, per_frag) in enumerate(ranked, 1):
            breakdown = "/".join(str(per_frag[n]) for n in shown_frags)
            title_total = sum(per_frag.values())
            print(f"   {rank:<3d} '{title}': {breakdown} ({title_total})")