Example No. 1
# Shared imports for the examples below; helpers such as getreqarg_*,
# _get_topn_*, get_ngramm_filter, json_response and _logger are defined
# elsewhere in the source module.
from collections import Counter, defaultdict
from functools import partial
from operator import itemgetter

from aiohttp import web


async def _req_frags_ngramm_ngramm(request: web.Request) -> web.StreamResponse:
  """Кросс-распределение «5 фрагментов» - «фразы из контекстов цитирований»"""
  app = request.app
  mdb = app['db']

  topn = getreqarg_topn(request)
  if not topn:
    topn = 10

  nka: int = getreqarg_nka(request)
  ltype: str = getreqarg_ltype(request)

  n_gramms = mdb.n_gramms
  topN = await _get_topn_ngramm(
    n_gramms, nka, ltype, topn, title_always_id=True, show_type=True)
  exists = frozenset(t for t, *_ in topN)

  pipeline = [
    # Drop heavy fields that are not needed downstream.
    {'$project': {
      'prefix': 0, 'suffix': 0, 'exact': 0, 'positive_negative': 0,
      'linked_papers_topics': 0, 'bundles': 0}},
    # Join each context with the n-grams linked to it.
    {'$lookup': {
      'from': 'n_gramms', 'localField': 'linked_papers_ngrams._id',
      'foreignField': '_id', 'as': 'cont'}},
    {'$unwind': '$cont'},
  ]

  if nka or ltype:
    pipeline += [get_ngramm_filter(nka, ltype, 'cont')]

  pipeline += [
    {'$unwind': '$linked_papers_ngrams'},
    {'$match': {'$expr': {'$eq': ['$linked_papers_ngrams._id', '$cont._id']}}},
  ]

  out_list = []
  contexts = mdb.contexts

  for ngrmm, _typ, cnt, conts in topN:
    congr = defaultdict(Counter)
    titles = {}
    types = {}

    work_pipeline = [
      {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': conts}}},
    ] + pipeline

    async for doc in contexts.aggregate(work_pipeline):
      cont = doc['cont']
      ngr_id = cont['_id']
      if ngr_id not in exists:
        continue
      fnum = doc['frag_num']
      congr[ngr_id][fnum] += doc['linked_papers_ngrams']['cnt']
      titles[ngr_id] = cont['title']
      types[ngr_id] = cont['type']

    frags = congr.pop(ngrmm)
    crossgrams = []
    otype = ltype if ltype else types[ngrmm]
    # crossgrams is filled in below; the result dict keeps a reference to it.
    out_list.append(
      dict(
        title=titles[ngrmm], type=otype, sum=cnt, cnt_cross=len(congr),
        frags=frags, crossgrams=crossgrams))

    for co, cnts in sorted(
      congr.items(), key=lambda kv: (-sum(kv[1].values()), kv[0])
    ):
      crossgrams.append(
        dict(
          title=titles[co], type=types[co], frags=cnts, sum=sum(cnts.values())))

  return json_response(out_list)
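
For orientation, here is a minimal sketch of the shape of a `mdb.contexts` document that these pipelines appear to assume, reconstructed only from the fields the handlers read or project away; everything else about the schema is a guess.

# Hypothetical contexts document, inferred from the fields referenced in
# these handlers; not taken from the source project.
EXAMPLE_CONTEXT_DOC = {
  '_id': 'context-id',              # context identifier
  'frag_num': 3,                    # fragment number, 1..5
  'pub_id': 'publication-id',       # used by Example No. 3
  'linked_papers_ngrams': [         # n-grams occurring in this context
    {'_id': 'ngram-id', 'cnt': 2},  # reference into n_gramms, with count
  ],
  # Projected away by the pipelines: 'prefix', 'suffix', 'exact',
  # 'positive_negative', 'linked_papers_topics', 'bundles'.
}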
Example No. 2
async def _req_frags_ngramm(request: web.Request) -> web.StreamResponse:
  """Распределение «5 фрагментов» - «фразы из контекстов цитирований»"""
  app = request.app
  mdb = app['db']

  topn = getreqarg_topn(request)
  if not topn:
    topn = 10

  nka: int = getreqarg_nka(request)
  ltype: str = getreqarg_ltype(request)

  pipeline = [
    {'$match': {
      'frag_num': {'$exists': 1}, 'linked_papers_ngrams': {'$exists': 1}}},
    {'$project': {
      '_id': 1, 'frag_num': 1, 'linked_paper': '$linked_papers_ngrams'}},
    {'$unwind': '$linked_paper'},
    # Sum the n-gram counts per (n-gram, fragment) pair ...
    {'$group': {
      '_id': {'_id': '$linked_paper._id', 'frag_num': '$frag_num'},
      'count': {'$sum': '$linked_paper.cnt'}}},
    # ... then roll up per n-gram, keeping the per-fragment breakdown.
    {'$group': {
      '_id': '$_id._id', 'count': {'$sum': '$count'},
      'frags': {'$push': {'frag_num': '$_id.frag_num', 'count': '$count'}}}},
    {'$sort': {'count': -1, '_id': 1}},
    {'$lookup': {
      'from': 'n_gramms', 'localField': '_id', 'foreignField': '_id',
      'as': 'ngramm'}},
    {'$unwind': '$ngramm'},
  ]

  if nka or ltype:
    pipeline += [get_ngramm_filter(nka, ltype, 'ngramm')]

  pipeline += [
    {'$project': {
      'title': '$ngramm.title', 'type': '$ngramm.type', 'nka': '$ngramm.nka',
      'count': '$count', 'frags': '$frags'}}]

  # topn is always set here (defaulted to 10 above).
  pipeline += [{'$limit': topn}]

  _logger.debug('pipeline: %s', pipeline)
  contexts = mdb.contexts
  out_dict = {}

  async for doc in contexts.aggregate(pipeline):
    title = doc['title']
    cnt = doc['count']
    frags = {n: 0 for n in range(1, 6)}
    frags.update(map(itemgetter('frag_num', 'count'), doc['frags']))
    dtype = doc['type']
    out = dict(sum=cnt, frags=frags)
    if not nka:
      out.update(nka=doc['nka'])
    if ltype:
      out_dict[title] = out
    else:
      out.update(type=dtype, title=title)
      did = doc['_id']
      out_dict[did] = out

  return json_response(out_dict)
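
Several of these examples append `get_ngramm_filter(nka, ltype, <prefix>)` to the pipeline. Its definition is not shown here, but Example No. 3 below builds the equivalent `$match` stage inline, so a plausible sketch of the helper is:

def get_ngramm_filter(nka: int, ltype: str, prefix: str = '') -> dict:
  """Build a $match stage filtering n-grams by length (nka) and type.

  Sketch inferred from the inline filter in Example No. 3; the real helper
  in the source project may differ.
  """
  pref = f'{prefix}.' if prefix else ''
  return {'$match': {
    f'{pref}{fld}': val
    for fld, val in (('nka', nka), ('type', ltype)) if val}}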
Example No. 3
async def _req_publ_publications_ngramms(
  request: web.Request
) -> web.StreamResponse:
  """Кросс-распределение «фразы из контекстов цитирований» по публикациям"""
  app = request.app
  mdb = app['db']

  publications = mdb.publications
  pubs = {
    pdoc['_id']: pdoc['name']
    async for pdoc in publications.find({'name': {'$exists': True}}).sort('_id')
  }

  topn = getreqarg_topn(request)
  if not topn:
    topn = 10

  nka: int = getreqarg_nka(request)
  ltype: str = getreqarg_ltype(request)

  if nka or ltype:
    postmatch = [
      {'$match': {
        f: v for f, v in (('cont.nka', nka), ('cont.type', ltype)) if v}}]
  else:
    postmatch = None

  pipeline = [
    {'$project': {
      'prefix': 0, 'suffix': 0, 'exact': 0, 'linked_papers_topics': 0,
      'positive_negative': 0, 'bundles': 0}},
    {'$unwind': '$linked_papers_ngrams'},
    {'$lookup': {
      'from': 'n_gramms', 'localField': 'linked_papers_ngrams._id',
      'foreignField': '_id', 'as': 'cont'}},
    {'$unwind': '$cont'},
  ]
  if postmatch:
    pipeline += postmatch

  contexts = mdb.contexts
  n_gramms = mdb.n_gramms

  out_pub_list = []

  for pub_id, pub_desc in pubs.items():
    topN = await _get_topn_ngramm(
      n_gramms, nka, ltype, topn, pub_id=pub_id, title_always_id=True,
      show_type=True)
    exists = frozenset(map(itemgetter(0), topN))

    out_list = []
    oconts = set()

    for ngrmm, ntype, _cnt, conts in topN:
      congr = defaultdict(set)
      ngrms = {}

      work_pipeline = [
        {'$match': {'_id': {'$in': conts}, 'pub_id': pub_id}}
      ] + pipeline + [
        {'$match': {'cont.type': ntype}}
      ]
      async for doc in contexts.aggregate(work_pipeline):
        cont = doc['cont']
        ngr_id = cont['_id']
        ngr = cont['title']
        if ngr_id not in exists:
          continue
        cid = doc['_id']
        oconts.add(cid)
        congr[ngr_id].add(cid)
        ngrms[ngr_id] = dict(type=cont['type'], title=ngr, nka=cont['nka'])

      # Use a distinct name here: reusing `pubs` would shadow the
      # publications dict iterated by the outer loop.
      base_conts = congr.pop(ngrmm)
      b_ngrm = ngrms.pop(ngrmm)
      crossgrams = []

      for co, vals in sorted(
        congr.items(), key=lambda kv: (-len(kv[1]), kv[0])
      ):
        co_ = ngrms[co]
        crossgrams.append(
          dict(type=co_['type'], title=co_['title'], conts_len=len(vals)))

      out_list.append(dict(
        type=b_ngrm['type'], title=b_ngrm['title'], nka=b_ngrm['nka'],
        conts=tuple(sorted(base_conts)), conts_len=len(base_conts),
        crossgrams=crossgrams, crossgrams_len=len(crossgrams)))

    out_pub_list.append(dict(
      pub_id=pub_id, descr=pub_desc, ngrams=out_list, ngrams_len=len(out_list),
      conts=tuple(sorted(oconts)), conts_len=len(oconts)))

  return json_response(out_pub_list)
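
Each handler reads the Motor database from `app['db']`. A minimal wiring sketch for an aiohttp application follows; the route paths, connection string, and database name are illustrative, not taken from the source project.

from aiohttp import web
from motor.motor_asyncio import AsyncIOMotorClient


def create_app() -> web.Application:
  app = web.Application()
  # Placeholder URI and database name.
  app['db'] = AsyncIOMotorClient('mongodb://localhost:27017')['citations']
  # Hypothetical routes for the handlers above.
  app.router.add_get('/frags/ngramms', _req_frags_ngramm)
  app.router.add_get('/frags/ngramms/ngramms', _req_frags_ngramm_ngramm)
  app.router.add_get(
    '/publ/publications/ngramms', _req_publ_publications_ngramms)
  return app


if __name__ == '__main__':
  web.run_app(create_app())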
Example No. 4
async def _req_frags_cocitauthors_ngramms(
  request: web.Request
) -> web.StreamResponse:
  """Cross-distribution "co-citations" - "phrases from citation contexts"."""
  app = request.app
  mdb = app['db']

  topn: int = getreqarg_topn(request)
  topn_gramm: int = getreqarg_int(request, 'topn_gramm')
  if not topn_gramm:
    topn_gramm = 500

  nka: int = getreqarg_nka(request)
  ltype: str = getreqarg_ltype(request)

  # topn_gramm is always truthy here (defaulted to 500 above),
  # so the top-n-grams filter is always built.
  n_gramms = mdb.n_gramms
  top_ngramms = await _get_topn_ngramm(
    n_gramms, nka, ltype, topn_gramm, title_always_id=True)
  exists = frozenset(t for t, _, _ in top_ngramms)

  contexts = mdb.contexts
  topN = await _get_topn_cocit_authors(contexts, topn, include_conts=True)

  pipeline = [
    {'$project': {
      'prefix': 0, 'suffix': 0, 'exact': 0, 'positive_negative': 0,
      'bundles': 0, 'linked_papers_topics': 0}},
    {'$lookup': {
      'from': 'n_gramms', 'localField': 'linked_papers_ngrams._id',
      'foreignField': '_id', 'as': 'cont'}},
    {'$unwind': '$cont'},
  ]

  if nka or ltype:
    pipeline += [get_ngramm_filter(nka, ltype, 'cont')]

  pipeline += [
    {'$unwind': '$linked_papers_ngrams'},
    {'$match': {'$expr': {'$eq': ['$cont._id', '$linked_papers_ngrams._id']}}},
  ]

  out_dict = {}
  for cocitauthor, cnt, conts in topN:
    frags = Counter()
    congr = defaultdict(Counter)
    titles = {}
    types = {}

    work_pipeline = [
      {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': conts}}},
    ] + pipeline

    async for doc in contexts.aggregate(work_pipeline):
      cont = doc['cont']
      ngr_title = cont['title']
      ngr_id = cont['_id']
      if ngr_id not in exists:
        continue

      fnum = doc['frag_num']
      frags[fnum] += 1
      ngr_cnt = doc['linked_papers_ngrams']['cnt']
      congr[ngr_id][fnum] += ngr_cnt
      titles[ngr_id] = ngr_title
      types[ngr_id] = cont['type']

    crossgrams = []
    out_dict[cocitauthor] = dict(sum=cnt, frags=frags, crossgrams=crossgrams)

    for co, cnts in sorted(
      congr.items(), key=lambda kv: (-sum(kv[1].values()), kv[0])
    ):
      ngr = dict(
        title=titles[co], type=types[co], frags=cnts, sum=sum(cnts.values()))
      crossgrams.append(ngr)

  return json_response(out_dict)
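
The `getreqarg_*` helpers are likewise defined elsewhere in the source module. Judging by how their results are used (falsy when a parameter is absent), they plausibly look like this sketch:

def getreqarg_int(request: web.Request, name: str) -> int:
  """Fetch an integer query parameter; 0 when absent or malformed (assumed)."""
  try:
    return int(request.query.get(name, ''))
  except ValueError:
    return 0


def getreqarg_topn(request: web.Request) -> int:
  """Shortcut for the common 'topn' parameter (assumed)."""
  return getreqarg_int(request, 'topn')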
Example No. 5
async def _req_frags_topics_ngramms(request: web.Request) -> web.StreamResponse:
  """Кросс-распределение «топики» - «фразы»"""
  app = request.app
  mdb = app['db']

  topn = getreqarg_topn(request)

  # The query-string key keeps the source spelling ('topn_crpssgramm').
  topn_crossgramm: int = getreqarg_int(request, 'topn_crpssgramm')
  topn_gramm: int = getreqarg_int(request, 'topn_gramm')
  if not topn_gramm:
    topn_gramm = 500

  nka = getreqarg_nka(request)
  ltype = getreqarg_ltype(request)

  # topn_gramm is always truthy here (defaulted to 500 above),
  # so the top-n-grams filter is always built.
  n_gramms = mdb.n_gramms
  top_ngramms = await _get_topn_ngramm(
    n_gramms, nka, ltype, topn_gramm, title_always_id=True)
  exists = frozenset(t for t, _, _ in top_ngramms)

  pipeline = [
    {'$project': {
      'prefix': 0, 'suffix': 0, 'exact': 0, 'positive_negative': 0,
      'bundles': 0, 'linked_papers_topics': 0}},
    {'$unwind': '$linked_papers_ngrams'},
    {'$lookup': {
      'from': 'n_gramms', 'localField': 'linked_papers_ngrams._id',
      'foreignField': '_id', 'as': 'ngrm'}},
    {'$unwind': '$ngrm'},
  ]

  if nka or ltype:
    pipeline += [get_ngramm_filter(nka, ltype, 'ngrm')]

  pipeline += [
    {'$sort': {'ngrm.count_in_linked_papers': -1, 'ngrm.count_all': -1}},
  ]

  top_topics = await _get_topn_topics(mdb.topics, topn)
  contexts = mdb.contexts
  out_dict = {}
  zero_frags = {n: 0 for n in range(1, 6)}
  for topic, cnt, conts in top_topics:
    frags = Counter(zero_frags)
    congr = defaultdict(partial(Counter, zero_frags))

    work_pipeline = [
      {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': conts}}},
    ] + pipeline

    async for doc in contexts.aggregate(work_pipeline):
      cont = doc['ngrm']
      ngr = cont['title']
      if exists and cont['_id'] not in exists:
        continue

      fnum = doc['frag_num']
      frags[fnum] += 1
      congr[ngr][fnum] += 1
      if topn_crossgramm and len(congr) == topn_crossgramm:
        break

    crossgrams = {}
    out_dict[topic] = dict(sum=cnt, frags=frags, crossgrams=crossgrams)

    for co, cnts in sorted(
      congr.items(), key=lambda kv: (-sum(kv[1].values()), kv[0])
    ):
      crossgrams[co] = dict(frags=cnts, sum=sum(cnts.values()))

  return json_response(out_dict)
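
For reference, the JSON object this last handler returns maps each top topic to its context total, its per-fragment counts, and the cross-distributed phrases; schematically:

# Schematic response of _req_frags_topics_ngramms (values illustrative):
# {
#   "<topic>": {
#     "sum": 42,                       # contexts citing the topic
#     "frags": {"1": 5, ..., "5": 9},  # hits per fragment 1..5
#     "crossgrams": {
#       "<ngram title>": {"frags": {"1": 2, ...}, "sum": 7}
#     }
#   }
# }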