Example #1
0
async def _req_frags_ngramms_cocitauthors(request: web.Request) -> web.StreamResponse:
  """Cross-distribution "phrases" - "co-citations".

  For each of the top-N n-gramms, count co-cited-author occurrences per
  fragment number over the n-gramm's linked contexts.

  Query args:
    topn: number of top n-gramms to report (default 10).
    topn_cocitauthors: if set, only count authors from that top-N set.
    nka, ltype: n-gramm filters forwarded to _get_topn_ngramm.
  """
  app = request.app
  mdb = app['db']

  topn = getreqarg_topn(request)
  if not topn:
    topn = 10

  contexts = mdb.contexts

  topn_authors:int = getreqarg_int(request, 'topn_cocitauthors')
  if topn_authors:
    topNa = await _get_topn_cocit_authors(
      contexts, topn_authors, include_conts=False)
    # Restrict counting to these co-cited authors only.
    exists = frozenset(t for t, _ in topNa)
  else:
    exists = ()

  nka:int = getreqarg_int(request, 'nka')
  ltype:str = getreqarg(request, 'ltype')

  n_gramms = mdb.n_gramms
  top_ngramms = await _get_topn_ngramm(
    n_gramms, nka, ltype, topn, title_always_id=True, show_type=True)

  out_dict = []
  # The count from top_ngramms is not reused: `cnt` is recomputed below as
  # the number of matching (context, co-cited author) occurrences.
  for ngramm, typ_, _, conts in top_ngramms:
    frags = Counter()
    congr = defaultdict(Counter)
    cnt = 0

    async for doc in contexts.aggregate([
      {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': conts}}},
      {'$project': {'prefix': False, 'suffix': False, 'exact': False}},
      {'$unwind': '$cocit_authors'},
    ]):
      ngr = doc['cocit_authors']
      if topn_authors and ngr not in exists:
        continue

      cnt += 1
      fnum = doc['frag_num']
      frags[fnum] += 1
      congr[ngr][fnum] += 1

    if cnt:
      crosscocitaith = {}
      # Strip the "<nka>_" prefix from the n-gramm id to get the title.
      out_dict.append(dict(
        title=ngramm.split('_', 1)[-1], type=typ_,
        sum=cnt, frags=frags, cocitaithors=crosscocitaith))

      # Authors ordered by total count desc, then author name asc.
      for co, cnts in sorted(
        congr.items(), key=lambda kv: (-sum(kv[1].values()), kv[0])
      ):
        crosscocitaith[co] = dict(frags=cnts, sum=sum(cnts.values()))

  return json_response(out_dict)
Example #2
0
async def _reg_cnt_ngramm(request: web.Request) -> web.StreamResponse:
  """Per-n-gramm counters: totals, context counts, distinct publications.

  Query args: topn, nka, ltype (filters forwarded to get_ngramm_filter).
  """
  mdb = request.app['db']

  topn = getreqarg_topn(request)
  nka = getreqarg_int(request, 'nka')
  ltype = getreqarg(request, 'ltype')

  pipeline = []
  if nka or ltype:
    pipeline.append(get_ngramm_filter(nka, ltype))
  pipeline.append({'$sort': {'count_all': -1, 'title': 1, 'type': 1}})
  if topn:
    pipeline.append({'$limit': topn})

  unpack = itemgetter('title', 'type', 'linked_papers')
  results = []
  async for doc in mdb.n_gramms.aggregate(pipeline):
    title, lt, linked = unpack(doc)
    # When ltype is given as a filter the type field is redundant.
    item = dict(title=title) if ltype else dict(title=title, type=lt)
    total = n_conts = 0
    pub_ids = set()
    for cont_id, num in (c.values() for c in linked):
      n_conts += 1
      total += num
      # Context ids look like '<pub_id>@<n>'; keep the publication part.
      pub_ids.add(cont_id.rsplit('@', 1)[0])
    item.update(
      count_all=doc['count_all'], count=total, count_conts=n_conts,
      conts_pubs=len(pub_ids))
    results.append(item)

  return json_response(results)
Example #3
0
async def _req_frags_ngramms_topics(request: web.Request) -> web.StreamResponse:
  """Cross-distribution "phrases" - "topics of citation contexts".

  Query args:
    topn: number of top n-gramms to report (default 10).
    topn_topics: size of a top-topics set (currently has no effect; see NOTE).
    nka, ltype: n-gramm filters forwarded to _get_topn_ngramm.
  """
  app = request.app
  mdb = app['db']

  topn = getreqarg_topn(request)
  if not topn:
    topn = 10

  topn_topics:int = getreqarg_int(request, 'topn_topics')
  if topn_topics:
    topics = mdb.topics
    topNt = await _get_topn_topics(topics, topn=topn_topics)
    exists = frozenset(t for t, _, _ in topNt)
  else:
    exists = ()
  # NOTE(review): `exists` is built but never consulted in the loop below,
  # so `topn_topics` currently does not affect the output. Sibling handlers
  # apply a `... not in exists: continue` filter — confirm whether such a
  # check was intended here.

  nka:int = getreqarg_nka(request)
  ltype:str = getreqarg_ltype(request)

  n_gramms = mdb.n_gramms
  top_ngramms = await _get_topn_ngramm(
    n_gramms, nka, ltype, topn, title_always_id=True, show_type=True)

  contexts = mdb.contexts

  out_dict = []
  # Pre-seed fragment numbers 1..5 so every output carries all five buckets.
  zerro_frags = {n: 0 for n in range(1, 6)}
  for i, (ngrmm, typ_, cnt, conts) in enumerate(top_ngramms, 1):
    frags = Counter(zerro_frags)
    congr = defaultdict(partial(Counter, zerro_frags))

    pipeline = [
      {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': conts}}},
      {'$project': {
        'prefix': 0, 'suffix': 0, 'exact': 0, 'positive_negative': 0,
        'bundles': 0, 'linked_papers_ngrams': 0}},
      {'$unwind': '$linked_papers_topics'},
    ]
    # _logger.debug('ngrmm: "%s", cnt: %s, pipeline: %s', ngrmm, cnt, pipeline)
    async for doc in contexts.aggregate(pipeline):
      cont = doc['linked_papers_topics']
      topic = cont['_id']
      fnum = doc['frag_num']
      frags[fnum] += 1
      congr[topic][fnum] += 1

    crosstopics = {}
    # Strip the "<nka>_" prefix from the n-gramm id to get the title.
    out_dict.append(dict(
      title=ngrmm.split('_', 1)[-1], type=typ_, sum=cnt, frags=frags,
      crosstopics=crosstopics))

    # Topics ordered by total count desc, then topic id asc.
    for j, (co, cnts) in enumerate(
      sorted(congr.items(), key=lambda kv: (-sum(kv[1].values()), kv[0])), 1
    ):
      crosstopics[co] = dict(frags=cnts, sum=sum(cnts.values()))

  return json_response(out_dict)
Example #4
0
async def _req_frags_cocitauthors_topics(request: web.Request) -> web.StreamResponse:
  """Cross-distribution "co-citations" - "topics of citation contexts".

  For each top co-cited author, count topic occurrences per fragment
  number over the author's contexts.

  Query args:
    topn: number of top co-cited authors to report.
    topn_topics: if set, only count topics from the top topn_topics topics.
  """
  app = request.app
  mdb = app['db']

  topn:int = getreqarg_topn(request)
  topn_topics:int = getreqarg_int(request, 'topn_topics')
  if topn_topics:
    topics = mdb.topics
    # Bug fix: the restriction set must be sized by topn_topics — it was
    # previously requested with topn, silently ignoring the
    # 'topn_topics' request argument (cf. the same pattern in
    # _req_frags_ngramms_topics).
    top_topics = await _get_topn_topics(topics, topn=topn_topics)
    exists = frozenset(t for t, _, _ in top_topics)
  else:
    exists = ()

  contexts = mdb.contexts
  topN = await _get_topn_cocit_authors(contexts, topn, include_conts=True)

  # Shared tail of the aggregation: expand each context's topics and join
  # with the topics collection, keeping only the matching topic document.
  pipeline = [
    {'$project': {
      'prefix': 0, 'suffix': 0, 'exact': 0, 'positive_negative': 0,
      'bundles': 0, 'linked_papers_ngrams': 0, 'cocit_authors': 0}},
    {'$unwind': '$linked_papers_topics'},
    {'$lookup': {
      'from': 'topics', 'localField': 'linked_papers_topics._id',
      'foreignField': '_id', 'as': 'cont'}},
    {'$unwind': '$cont'},
    {'$match': {'$expr': {'$eq': ['$cont._id', '$linked_papers_topics._id']}}},
  ]

  out_dict = {}
  for cocitauthor, cnt, conts in topN:
    frags = Counter()
    congr = defaultdict(Counter)

    work_pipeline = [
      {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': conts}}},
    ] + pipeline

    async for doc in contexts.aggregate(work_pipeline):
      topic = doc['cont']['title']
      if topn_topics and topic not in exists:
        continue

      fnum = doc['frag_num']
      frags[fnum] += 1
      congr[topic][fnum] += 1

    crosstopics = {}
    out_dict[cocitauthor] = dict(sum=cnt, frags=frags, crosstopics=crosstopics)

    # Topics ordered by total count desc, then title asc.
    for co, cnts in sorted(
      congr.items(), key=lambda kv: (-sum(kv[1].values()), kv[0])
    ):
      crosstopics[co] = dict(frags=cnts, sum=sum(cnts.values()))

  return json_response(out_dict)
Example #5
0
async def _req_frags_topics_cocitauthors(
  request: web.Request
) -> web.StreamResponse:
  """Cross-distribution "topics" - "co-citations".

  For each top topic, count co-cited-author occurrences per fragment
  number over the topic's linked contexts.
  """
  mdb = request.app['db']

  topn = getreqarg_topn(request)
  contexts = mdb.contexts

  topn_authors:int = getreqarg_int(request, 'topn_cocitauthors')
  if not topn_authors:
    allowed = ()
  else:
    top_authors = await _get_topn_cocit_authors(
      contexts, topn_authors, include_conts=False)
    # Restrict counting to these co-cited authors only.
    allowed = frozenset(name for name, _ in top_authors)

  top_topics = await _get_topn_topics(mdb.topics, topn=topn)

  result = {}
  for topic, total, conts in top_topics:
    frag_counts = Counter()
    per_author = defaultdict(Counter)

    cursor = contexts.aggregate([
      {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': conts}}},
      {'$project': {'prefix': False, 'suffix': False, 'exact': False}},
      {'$unwind': '$cocit_authors'},
    ])
    async for doc in cursor:
      author = doc['cocit_authors']
      if topn_authors and author not in allowed:
        continue
      frag = doc['frag_num']
      frag_counts[frag] += 1
      per_author[author][frag] += 1

    cross = {}
    result[topic] = dict(sum=total, frags=frag_counts, cocitaithors=cross)

    # Authors ordered by total count desc, then author name asc.
    ordered = sorted(
      per_author.items(), key=lambda kv: (-sum(kv[1].values()), kv[0]))
    for author, counts in ordered:
      cross[author] = dict(frags=counts, sum=sum(counts.values()))

  return json_response(result)
Example #6
0
async def _req_top_ngramm(request: web.Request) -> web.StreamResponse:
  """Top N phrases.

  Query args: topn, nka, ltype (filters forwarded to _get_topn_ngramm).
  """
  mdb = request.app['db']

  topn = getreqarg_topn(request)
  nka = getreqarg_int(request, 'nka')
  ltype = getreqarg(request, 'ltype')

  top_ngramms = await _get_topn_ngramm(mdb.n_gramms, nka, ltype, topn)

  # Key 'contects' (sic) is preserved: it is part of the response schema.
  out = tuple(
    dict(title=title, contects=conts) for title, _, conts in top_ngramms)
  return json_response(out)
Example #7
0
async def _reg_cnt_pubs_ngramm(request: web.Request) -> web.StreamResponse:
  """Per-publication n-gramm counters.

  For every publication that has a `name`, aggregate the n-gramms whose
  linked contexts belong to that publication and report, per n-gramm, the
  global count plus counts restricted to that publication's contexts.

  Query args: topn, nka, ltype.
  """
  app = request.app
  mdb = app['db']

  topn = getreqarg_topn(request)

  nka = getreqarg_int(request, 'nka')
  ltype = getreqarg(request, 'ltype')
  if nka or ltype:
    pipeline = [get_ngramm_filter(nka, ltype)]
  else:
    pipeline = []

  n_gramms = mdb.n_gramms
  publications:Collection = mdb.publications
  out = {}
  get_as_tuple = itemgetter('title', 'type', 'linked_papers')

  async for pobj in publications.find({'name': {'$exists': True}}):
    pub_id = pobj['_id']
    # NOTE(review): pub_id is interpolated into $regex unescaped — an id
    # containing regex metacharacters (e.g. '.') would over-match; confirm
    # the id alphabet or escape with re.escape.
    pipeline_work = [
      {'$match': {'linked_papers.cont_id': {'$regex': f'^{pub_id}@'}}}
    ] + pipeline
    out_ngrs = []

    # Context ids look like '<pub_id>@<n>'; match this publication's own.
    cont_starts = pub_id + '@'
    async for obj in n_gramms.aggregate(pipeline_work):
      title, lt, conts = get_as_tuple(obj)
      # When ltype is given as a filter the type field is redundant.
      res = dict(title=title) if ltype else dict(title=title, type=lt)
      res.update(count_all=obj['count_all'])
      cnt_all = cnt_cont = 0
      for cid, cnt in (c.values() for c in conts):
        if cid.startswith(cont_starts):
          cnt_cont += 1
          cnt_all += cnt
      res.update(count=cnt_all, count_conts=cnt_cont,
        # conts=conts
      )
      out_ngrs.append(res)

    # Sort by in-publication count and truncate to topn, if requested.
    out_ngrs = sorted(out_ngrs, key=itemgetter('count'), reverse=True)
    if topn:
      out_ngrs = out_ngrs[:topn]
    out[pub_id] = out_ngrs

  return json_response(out)
Example #8
0
async def _req_top_ngramm_pubs(request: web.Request) -> web.StreamResponse:
  """Top N phrases by publication.

  Query args: topn, nka, ltype.
  Returns {ngramm_title: {'all': total, 'contects': {pub_id: count}}}.
  """
  app = request.app
  mdb = app['db']

  topn = getreqarg_topn(request)

  nka = getreqarg_int(request, 'nka')
  ltype = getreqarg(request, 'ltype')
  if nka or ltype:
    pipeline = [get_ngramm_filter(nka, ltype)]
  else:
    pipeline = []

  # Group by title, summing linked-paper counts and collecting the
  # (cont_id, cnt) pairs per n-gramm.
  pipeline += [
    {'$unwind': '$linked_papers'},
    {'$group': {
      '_id': '$title', 'count': {'$sum': '$linked_papers.cnt'}, 'conts': {
        '$addToSet': {
          'cont_id': '$linked_papers.cont_id', 'cnt': '$linked_papers.cnt'}}}},
    {'$sort': {'count': -1, '_id': 1}},
  ]
  if topn:
    pipeline += [{'$limit': topn}]

  n_gramms = mdb.n_gramms
  get_as_tuple = itemgetter('_id', 'count', 'conts')
  topN = [get_as_tuple(obj) async for obj in n_gramms.aggregate(pipeline)]

  get_pubs = itemgetter('cont_id', 'cnt')
  out = {}
  for name, cnt, conts in topN:
    # Sum counts per publication directly instead of materializing each
    # occurrence via Counter(range(n)) expansion — O(len(conts)) rather
    # than O(sum of counts); the resulting mapping is identical.
    pub_counts = Counter()
    for co in conts:
      cont_id, ngr_cnt = get_pubs(co)
      # Context ids look like '<pub_id>@<n>'; keep the publication part.
      # Guard keeps original behavior: non-positive counts add no entry.
      if ngr_cnt > 0:
        pub_counts[cont_id.rsplit('@', 1)[0]] += ngr_cnt
    # Key 'contects' (sic) is preserved: it is part of the response schema.
    out[name] = dict(all=cnt, contects=pub_counts)
  return json_response(out)
Example #9
0
async def _req_frags_cocitauthors_ngramms(request: web.Request) -> web.StreamResponse:
  """B: Cross-distribution "co-citations" - "phrases from citation contexts".

  For each top co-cited author, count n-gramm occurrences (weighted by the
  per-context n-gramm count) per fragment number over the author's contexts.

  Query args:
    topn: number of top co-cited authors to report.
    topn_gramm: size of the allowed n-gramm set (default 500).
    nka, ltype: n-gramm filters.
  """
  app = request.app
  mdb = app['db']

  topn:int = getreqarg_topn(request)
  # if not topn:
  #   topn = 10
  topn_gramm:int = getreqarg_int(request, 'topn_gramm')
  if not topn_gramm:
    topn_gramm = 500

  nka:int = getreqarg_nka(request)
  ltype:str = getreqarg_ltype(request)

  # topn_gramm is always truthy here (defaulted to 500 above), so the
  # else-branch below is effectively dead.
  if topn_gramm:
    n_gramms = mdb.n_gramms
    top_ngramms = await _get_topn_ngramm(
      n_gramms, nka, ltype, topn_gramm, title_always_id=True)
    exists = frozenset(t for t, _, _ in top_ngramms)
  else:
    exists = ()

  contexts = mdb.contexts
  topN = await _get_topn_cocit_authors(contexts, topn, include_conts=True)

  # Shared tail of the aggregation: join each context's n-gramms with the
  # n_gramms collection and keep only the matching n-gramm document.
  pipeline = [
    {'$project': {
      'prefix': 0, 'suffix': 0, 'exact': 0, 'positive_negative': 0,
      'bundles': 0, 'linked_papers_topics': 0}},
    {'$lookup': {
      'from': 'n_gramms', 'localField': 'linked_papers_ngrams._id',
      'foreignField': '_id', 'as': 'cont'}},
    {'$unwind': '$cont'},
  ]

  if nka or ltype:
    pipeline += [get_ngramm_filter(nka, ltype, 'cont')]

  pipeline += [
    {'$unwind': '$linked_papers_ngrams'},
    {'$match': {'$expr': {'$eq': ['$cont._id', '$linked_papers_ngrams._id']}}},
  ]

  out_dict = {}
  for i, (cocitauthor, cnt, conts) in enumerate(topN, 1):
    frags = Counter()
    congr = defaultdict(Counter)
    titles = {}   # n-gramm id -> display title
    types = {}    # n-gramm id -> n-gramm type

    work_pipeline = [
      {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': conts}}},
    ] +  pipeline

    # _logger.debug('cocitauthor: "%s", cnt: %s, pipeline: %s', cocitauthor, cnt, work_pipeline)
    async for doc in contexts.aggregate(work_pipeline):
      cont = doc['cont']
      ngr_title = cont['title']
      ngr_id = cont['_id']
      if topn_gramm and ngr_id not in exists:
        continue

      fnum = doc['frag_num']
      frags[fnum] += 1
      # Weight by how many times the n-gramm occurs in this context.
      ngr_cnt = doc['linked_papers_ngrams']['cnt']
      congr[ngr_id][fnum] += ngr_cnt
      titles[ngr_id] = ngr_title
      types[ngr_id] = cont['type']

    crossgrams = []
    out_dict[cocitauthor] = dict(sum=cnt, frags=frags, crossgrams=crossgrams)

    # N-gramms ordered by total weighted count desc, then id asc.
    for j, (co, cnts) in enumerate(
      sorted(congr.items(), key=lambda kv: (-sum(kv[1].values()), kv[0])), 1
    ):
      ngr = dict(
        title=titles[co], type=types[co], frags=cnts, sum=sum(cnts.values()))
      crossgrams.append(ngr)

  return json_response(out_dict)
Example #10
0
async def _req_publ_cocitauthors_cocitauthors(
  request: web.Request
) -> web.StreamResponse:
  """
  A
  Cross-distribution "publications" - "co-cited authors".

  For each top co-cited author, collect the publications whose contexts
  co-cite them together with other (optionally top-N restricted) authors.

  Query args: topn, topn_cocitauthors.
  """
  app = request.app
  mdb = app['db']

  topn = getreqarg_topn(request)
  topn_cocitauthors: int = getreqarg_int(request, 'topn_cocitauthors')

  contexts = mdb.contexts
  topN = await _get_topn_cocit_authors(contexts, topn)

  exists = ()
  if topn_cocitauthors:
    # Reuse topN if it already covers the requested restriction size
    # (unlimited topn also covers it); otherwise fetch a longer top list
    # just for building the filter.
    if not topn or topn >= topn_cocitauthors:
      exists = frozenset(map(itemgetter(0), topN[:topn_cocitauthors]))
    else:
      topNa = await _get_topn_cocit_authors(contexts, topn_cocitauthors)
      exists = frozenset(map(itemgetter(0), topNa))

  out_list = []
  for i, (cocitauthor, _) in enumerate(topN, 1):
    cnt = 0            # contexts with at least one relevant co-author
    pubs = set()       # publications containing such contexts
    coaut = defaultdict(set)  # co-author -> set of publication ids
    async for doc in contexts.find(
      {'cocit_authors': cocitauthor, 'frag_num': {'$gt': 0}},
      projection=['pub_id', 'cocit_authors']
    ).sort('frag_num'):
      # print(i, doc)
      # Other authors co-cited in the same context, optionally filtered
      # to the top-N restriction set.
      if exists:
        coauthors = frozenset(
          c for c in doc['cocit_authors'] if c != cocitauthor and c in exists)
      else:
        coauthors = frozenset(
          c for c in doc['cocit_authors'] if c != cocitauthor)
      if not coauthors:
        continue
      cnt += 1
      pub_id = doc['pub_id']
      pubs.add(pub_id)
      for ca in coauthors:
        coaut[ca].add(pub_id)

    # Skip authors that are never co-cited with any relevant co-author.
    if not coaut:
      continue

    out_cocitauthors = []
    out_list.append(
      dict(
        title=cocitauthor, cnt_pubs=len(pubs), cnt_cross=len(coaut),
        pubs=tuple(sorted(pubs)), cocitauthors=out_cocitauthors))

    # Co-authors ordered by shared-publication count desc, then name asc.
    for j, (co, vals) in enumerate(
      sorted(coaut.items(), key=lambda kv: (-len(kv[1]), kv[0])), 1
    ):
      out_cocitauthors.append(
        dict(title=co, cnt_pubs=len(vals), pubs=tuple(sorted(vals))))

  return json_response(out_list)
Example #11
0
async def _req_frags_topics_ngramms(request: web.Request) -> web.StreamResponse:
  """Cross-distribution "topics" - "phrases".

  For each top topic, count n-gramm occurrences per fragment number over
  the topic's linked contexts.

  Query args:
    topn: number of top topics to report.
    topn_gramm: size of the allowed n-gramm set (default 500).
    topn_crpssgramm: cap on distinct cross n-gramms per topic (arg name sic).
    nka, ltype: n-gramm filters.
  """
  app = request.app
  mdb = app['db']

  topn = getreqarg_topn(request)

  topn_crpssgramm: int = getreqarg_int(request, 'topn_crpssgramm')
  topn_gramm: int = getreqarg_int(request, 'topn_gramm')
  if not topn_gramm:
    topn_gramm = 500

  nka = getreqarg_nka(request)
  ltype = getreqarg_ltype(request)

  # topn_gramm is always truthy here (defaulted to 500 above), so the
  # else-branch below is effectively dead.
  if topn_gramm:
    n_gramms = mdb.n_gramms

    top_ngramms = await _get_topn_ngramm(
      n_gramms, nka, ltype, topn_gramm, title_always_id=True)
    exists = frozenset(t for t, _, _ in top_ngramms)
  else:
    exists = ()

  # Shared tail of the aggregation: expand each context's n-gramms and
  # join with the n_gramms collection.
  pipeline = [
    {'$project': {
      'prefix': 0, 'suffix': 0, 'exact': 0, 'positive_negative': 0,
      'bundles': 0, 'linked_papers_topics': 0}},
    {'$unwind': '$linked_papers_ngrams'},
    {'$lookup': {
      'from': 'n_gramms', 'localField': 'linked_papers_ngrams._id',
      'foreignField': '_id', 'as': 'ngrm'}},
    {'$unwind': '$ngrm'},
  ]

  if nka or ltype:
    pipeline += [get_ngramm_filter(nka, ltype, 'ngrm')]

  pipeline += [
    {'$sort': {'ngrm.count_in_linked_papers': -1, 'ngrm.count_all': -1}},
  ]

  top_topics = await _get_topn_topics(mdb.topics, topn)
  contexts = mdb.contexts
  out_dict = {}
  # Pre-seed fragment numbers 1..5 so every output carries all five buckets.
  zerro_frags = {n: 0 for n in range(1, 6)}
  for i, (topic, cnt, conts) in enumerate(top_topics, 1):
    frags = Counter(zerro_frags)
    congr = defaultdict(partial(Counter, zerro_frags))

    work_pipeline = [
      {'$match': {'frag_num': {'$gt': 0}, '_id': {'$in': conts}}},
    ] + pipeline

    # _logger.debug('topic: "%s", cnt: %s, pipeline: %s', topic, cnt, work_pipeline)

    async for doc in contexts.aggregate(work_pipeline):
      cont = doc['ngrm']
      ngr = cont['title']
      if exists and cont['_id'] not in exists:
        continue

      fnum = doc['frag_num']
      frags[fnum] += 1
      congr[ngr][fnum] += 1
      # Stop scanning once the per-topic cross-gramm cap is reached.
      if topn_crpssgramm and len(congr) == topn_crpssgramm:
        break

    crossgrams = {}
    out_dict[topic] = dict(sum=cnt, frags=frags, crossgrams=crossgrams)

    # N-gramms ordered by total count desc, then title asc.
    for j, (co, cnts) in enumerate(
      sorted(congr.items(), key=lambda kv: (-sum(kv[1].values()), kv[0])), 1
    ):
      crossgrams[co] = dict(frags=cnts, sum=sum(cnts.values()))

  return json_response(out_dict)