Example #1
    def _list_columns(self,
                      experiment_id: str,
                      stages: List[LifecycleStage],
                      column_type: str,
                      columns: List[str],
                      size: int = 100) -> None:
        s = Search(index="mlflow-runs").filter("match", experiment_id=experiment_id) \
            .filter("terms", lifecycle_stage=stages)
        # Composite aggregation over the nested column keys, paged `size` keys at a time.
        s.aggs.bucket(column_type, 'nested', path=column_type) \
            .bucket(f'{column_type}_keys', "composite", size=size,
                    sources=[{"key": {"terms": {"field": f'{column_type}.key'}}}])
        response = s.params(size=0).execute()
        new_columns = [
            column.key.key for column in attrgetter(
                f'aggregations.{column_type}.{column_type}_keys.buckets')(
                    response)
        ]
        columns += new_columns
        # A full page means there may be more keys: re-run the aggregation,
        # resuming from the previous page's after_key.
        while len(new_columns) == size:
            last_col = attrgetter(
                f'aggregations.{column_type}.{column_type}_keys.after_key.key'
            )(response)
            s = Search(index="mlflow-runs").filter("match", experiment_id=experiment_id) \
                .filter("terms", lifecycle_stage=stages)
            s.aggs.bucket(column_type, 'nested', path=column_type) \
                .bucket(f'{column_type}_keys', "composite", size=size,
                        sources=[{"key": {"terms": {"field": f'{column_type}.key'}}}],
                        after={"key": last_col})
            response = s.params(size=0).execute()
            new_columns = [
                column.key.key for column in attrgetter(
                    f'aggregations.{column_type}.{column_type}_keys.buckets')(
                        response)
            ]
            columns += new_columns
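The loop above pages a composite aggregation by re-issuing the query with the previous response's after_key. A minimal sketch of the same pagination pattern as a generator, assuming an `es_client` connection and a "mlflow-runs" index with a nested "params" field (these names are illustrative, not part of the original):

from elasticsearch_dsl import Search

def iter_nested_keys(es_client, experiment_id, path="params", page_size=100):
    """Yield every distinct `<path>.key` value, one composite-aggregation page at a time."""
    after = None
    while True:
        s = Search(using=es_client, index="mlflow-runs") \
            .filter("match", experiment_id=experiment_id)
        composite = {"size": page_size,
                     "sources": [{"key": {"terms": {"field": f"{path}.key"}}}]}
        if after is not None:
            composite["after"] = after  # resume from the previous page
        s.aggs.bucket(path, "nested", path=path) \
            .bucket(f"{path}_keys", "composite", **composite)
        response = s.params(size=0).execute()
        buckets = response.aggregations[path][f"{path}_keys"].buckets
        for bucket in buckets:
            yield bucket.key.key
        if len(buckets) < page_size:
            break  # a short page means there are no more keys
        after = response.aggregations[path][f"{path}_keys"].after_key.to_dict()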
Example #2
def doctor(doc):
    for doc_type in doc:
        start, limit, i, err = 0, 500, 0, 0
        while True:
            try:
                s = Search(using=client, index="doctor-%s" % doc_type).sort()
                s = s[start:start+limit]
                s.params(scroll="1024M")
                res = s.execute()
                if s.count() == 0:
                    break
                for hit in res:
                    i += 1
                    print("%s--%s--%s" % (doc_type, i, hit.name))
                    d = Doctor.nodes.get_or_none(did=hit.document_id)
                    if d: continue
                    data = hit.to_dict()
                    goodat = data.get('goodat', None)
                    description = data.get('description', None)
                    sex = data.get('sex', None)  # no trailing comma (it would turn sex into a tuple)
                    d = Doctor(
                        did=data['document_id'],
                        name=data['name'],
                        goodat="".join(goodat.split()) if goodat else None,
                        sex=''.join(sex.split()) if sex and isinstance(sex, str) else None,
                        description=''.join(description.split()) if description and isinstance(description, str) else None,
                        title=data.get('title', None),
                        sourceUrl=data.get('source_url', None),
                        sourceType=data.get('document_type'),
                        headerUrl=data.get('headerUrl', None),
                    ).save()
                    hs = data.get('hospitals', [])
                    deps = []
                    for h in hs:
                        deps.extend(h['departments'])
                        hos = Hospital.nodes.get_or_none(hid=h['hospital_id'])
                        if hos:
                            d.hospital.connect(hos, {
                                'department': ','.join(h['departments'])
                            })
                    if deps:
                        department = getDepartment(deps[0])
                        if department:
                            d.department.connect(department)
                    province = getProvince(data.get('province', None))
                    city = getCity(data.get('city', None), province)
                    if province:
                        d.province.connect(province)
                    if city:
                        d.city.connect(city)
                    d.save()
                    del data
                start += limit
            except Exception as e:
                print(e)
                err += 1
                if err > 10:
                    break
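The manual start/limit paging above can usually be replaced by the scan() helper that most of the later examples rely on. A minimal sketch under the same assumptions (a `client` connection and "doctor-<type>" indices):

from elasticsearch_dsl import Search

def iter_doctors(client, doc_type):
    """Yield every document of one doctor index as a dict, letting scan() drive the scrolling."""
    s = Search(using=client, index="doctor-%s" % doc_type)
    s = s.params(scroll="10m", size=500)  # scroll-context lifetime and batch size per fetch
    for hit in s.scan():
        yield hit.to_dict()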
Example #3
def run_query_fatcat(query: str, fulltext_only: bool, json_output: Any) -> None:
    """
    Queries the fatcat search index (the full regular fatcat.wiki release index)
    for the search string passed in (plus some filters), iterates over the result
    set (using scroll), and fetches the full release entity (via api.fatcat.wiki)
    for each hit.

    TODO: group by work_id
    """
    api_session = requests_retry_session()

    es_backend = os.environ.get(
        "ELASTICSEARCH_FATCAT_BASE", "https://search.fatcat.wiki"
    )
    es_index = os.environ.get("ELASTICSEARCH_FATCAT_RELEASE_INDEX", "fatcat_release")
    es_client = elasticsearch.Elasticsearch(es_backend)

    search = Search(using=es_client, index=es_index)

    search = search.exclude("terms", release_type=["stub", "component", "abstract"])

    # "Emerald Expert Briefings"
    search = search.exclude("terms", container_id=["fnllqvywjbec5eumrbavqipfym"])

    # ResearchGate
    search = search.exclude("terms", doi_prefix=["10.13140"])

    if fulltext_only:
        search = search.filter("terms", in_ia=True)

    search = search.query(
        Q("query_string", query=query, default_operator="AND", fields=["biblio"])
    )

    print(f"Expecting {search.count()} search hits", file=sys.stderr)

    search = search.params(clear_scroll=False)
    search = search.params(_source=False)

    results = search.scan()
    for hit in results:
        release_id = hit.meta.id
        resp = api_session.get(
            f"https://api.fatcat.wiki/v0/release/{release_id}",
            params={
                "expand": "container,files,filesets,webcaptures",
                "hide": "references",
            },
        )
        resp.raise_for_status()
        row = dict(
            fatcat_hit=hit.meta._d_,
            release_id=release_id,
            fatcat_release=resp.json(),
        )
        print(json.dumps(row, sort_keys=True), file=json_output)
Example #4
def pr_links_query(paper_ids):
    ''' Get reference, citation, and field-of-study links for a set of papers.
    '''
    # Elastic search client
    client = Elasticsearch(conf.get("elasticsearch.hostname"))

    # Targets
    pr_targets = ['PaperId', 'PaperReferenceId', 'FieldOfStudyId']

    # Query results
    references = list()
    citations = list()
    fieldsofstudy = list()

    # Result dictionary
    results = dict()
    for paper_id in paper_ids:
        results[paper_id] = {
            'References': [],
            'Citations': [],
            'FieldsOfStudy': []
        }

    # Query for paper references
    ref_s = Search(index='paperreferences', using=client)
    ref_s = ref_s.query('terms', PaperId=paper_ids)
    ref_s = ref_s.params(request_timeout=TIMEOUT)

    # Convert into dictionary format
    for ref_info in ref_s.scan():
        results[ref_info[pr_targets[0]]]['References'].append(
            ref_info[pr_targets[1]])

    # Query for paper citations
    cit_s = Search(index='paperreferences', using=client)
    cit_s = cit_s.query('terms', PaperReferenceId=paper_ids)
    cit_s = cit_s.params(request_timeout=TIMEOUT)

    # Convert into dictionary format
    for cit_info in cit_s.scan():
        results[cit_info[pr_targets[1]]]['Citations'].append(
            cit_info[pr_targets[0]])

    # Query for paper fields of study
    fos_s = Search(index='paperfieldsofstudy', using=client)
    fos_s = fos_s.query('terms', PaperId=paper_ids)
    fos_s = fos_s.params(request_timeout=TIMEOUT)

    # Convert into dictionary format
    for fos_info in fos_s.scan():
        results[fos_info[pr_targets[0]]]['FieldsOfStudy'].append(
            fos_info[pr_targets[2]])

    # Return results as a dictionary
    return results
Example #5
class AllSearchDocumentViewSet(BaseSearchDocumentViewSet):
    document = ActionDocument  # This needs to be filled with a valid Document
    serializer_class = (
        ActionSearchSerializer  # This needs to be filled with a valid Serializer
    )

    def __init__(self, *args, **kwargs):
        super(AllSearchDocumentViewSet, self).__init__(*args, **kwargs)

        self.search = Search(
            using=self.client,
            index=list(settings.ELASTICSEARCH_INDEX_NAMES.values()),
            doc_type=self.document._doc_type.name,
        ).sort(*self.ordering)
        self.search = self.search.params(preserve_order=False)  # params() returns a clone, so re-assign it
Example #6
    def get_indices(self, docTypes: List = ["default"]) -> str:
        """
        Returns a comma-separated string of the indices to search for the given doc types.

        :param docTypes:        List of doc types to search; if empty, all doc types will be searched.
        :return:                A string of index names to search (may use * to group multiple indices).
        """

        es = get_es_conn()

        indexNamesStr = ""
        if docTypes:
            s = Search(using=es,
                       index=self.typeIndex,
                       doc_type="directory_type").query("ids", values=docTypes)
            s = s.params(scroll=get_scan_scroll_duration(),
                         size=get_nb_documents_per_scan_scroll())

            indexNamesQuery = s.source(["indexName"])
            indexNamesArr = []
            for indexNamePart in indexNamesQuery.scan():
                indexNamesArr.append(indexNamePart["indexName"])
            indexNamesStr = ','.join(indexNamesArr)
        else:
            indexNamesStr = self.dataIndexPrefix + "*"

        return indexNamesStr
Example #7
def es_get_papers_fos(paperids):
    s = Search(using=client, index="paperfieldsofstudy")
    s = s.query("terms", PaperId=paperids)
    s = s.params(size=500)
    response = s.execute()
    result = response.to_dict()["hits"]["hits"]
    return result
Example #8
    def count(self, timeout=30):

        s = Search(using=self.es_client, index=self.resource_index)
        s = s.filter('term', **{'resource.keyword': 'gpc'})

        # Only count completed scans.
        s = s.filter('terms', **{'status.keyword': ['ok', 'failed']})
        # Only count base domains.
        s = s.filter('term', **{'is_base_domain': True})

        # Don't need any actual results - just the count and aggregations.
        s = s[0:0]

        # Use aggregation to count subset that reports support.
        supporting_filters = [{
            'term': {
                'scan_data.found': True
            }
        }, {
            'term': {
                'scan_data.gpc.parsed.gpc': True
            }
        }]
        s.aggs.bucket('supporting',
                      'filter',
                      bool={'filter': supporting_filters})

        s = s.extra(track_total_hits=True)
        s = s.params(request_timeout=timeout)

        response = s.execute()

        supporting_count = response.aggregations.supporting.doc_count

        return response.hits.total.value, supporting_count
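The 'supporting' bucket above passes the filter aggregation's body as a raw bool dict. A sketch of the same counts with the filter expressed through Q objects instead (field names are copied from the example; the client and index name are assumed):

from elasticsearch_dsl import Q, Search

def count_supporting(es_client, resource_index, timeout=30):
    """Count completed base-domain GPC scans and the subset reporting support."""
    s = Search(using=es_client, index=resource_index)[0:0]
    s = s.filter('term', **{'resource.keyword': 'gpc'})
    s = s.filter('terms', **{'status.keyword': ['ok', 'failed']})
    s = s.filter('term', **{'is_base_domain': True})
    # 'filter' aggregation whose body is a bool query rather than a raw dict
    supporting_q = Q('bool', filter=[
        Q('term', **{'scan_data.found': True}),
        Q('term', **{'scan_data.gpc.parsed.gpc': True}),
    ])
    s.aggs.bucket('supporting', 'filter', filter=supporting_q)
    s = s.extra(track_total_hits=True).params(request_timeout=timeout)
    response = s.execute()
    return response.hits.total.value, response.aggregations.supporting.doc_count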
Example #9
def es_get_fos_level(fosids):
    s = Search(using=client, index="fieldsofstudy")
    s = s.query("terms", FieldOfStudyId=fosids)
    s = s.params(size=500)
    response = s.execute()
    result = response.to_dict()["hits"]["hits"]
    return result
Example #10
    def find_tweetable(self, limit=10, timeout=30):

        s = Search(using=self.es_client, index=self.resource_index)
        s = s.filter('term', **{'resource.keyword': 'gpc'})

        # Only tweet about sites where the last scan succeeded, a gpc.json was
        # found, and it indicates support for GPC.
        s = s.filter('term', **{'status.keyword': 'ok'})
        s = s.filter('term', **{'scan_data.found': True})
        s = s.filter('term', **{'scan_data.gpc.parsed.gpc': True})
        # Only tweet about base domains, not subdomains.
        s = s.filter('term', **{'is_base_domain': True})
        # Don't tweet about sites we've previously tweeted about (or may have).
        # We may have set `tweeting` and failed before we could set `tweeted`. In that case, it's
        # unclear whether the tweet went out or not - it needs to be checked manually.
        s = s.exclude('term', **{'gpcsup.tweeting': True})
        s = s.exclude('term', **{'gpcsup.tweeted': True})

        s = s.sort('update_dt')
        s = s[:limit]
        s = s.params(request_timeout=timeout)

        response = s.execute()

        return [r.domain for r in response]
Example #11
def es_filter_papers_grant_range(paperids, ts, te):
    query_body = {  # raw query dict; not named Q, to avoid shadowing elasticsearch_dsl's Q helper
        "bool": {
            "must": [{
                "terms": {
                    "PaperId": paperids
                }
            }, {
                "range": {
                    "date": {
                        "gte": ts,
                        "lte": te
                    }
                }
            }]
        }
    }
    s = Search(using=client, index="papers")
    s = s.params(size=1000)
    s = s.query(query_body)
    response = s.execute()
    result = response.to_dict()["hits"]["hits"]
    data = []
    if result:
        data = [r["_source"]["PaperId"] for r in result]
    else:
        print("[es_filter_papers_grant_range] no result")
    return data
Example #12
    def find(self,
             sort=None,
             offset=0,
             limit=10,
             count=False,
             timeout=30,
             **filter_params):

        s = Search(using=self.es_client, index=self.resource_index)
        s = s.filter('term', **{'resource.keyword': 'gpc'})

        s = apply_filters(s, **filter_params)

        sort = build_sort(sort)
        if sort:
            s = s.sort(*sort)

        s = s[offset:offset + limit]

        s = s.params(request_timeout=timeout)

        if count:
            s = s.extra(track_total_hits=count)

        response = s.execute()

        sites = [(r.to_dict(), r.meta.score) for r in response]

        return response.hits.total.value, sites
Example #13
def browse_by_provider(provider, index, page_size, ip, request, filter_dead,
                       page=1, lt=None, li=None):
    """
    Allow users to browse image collections without entering a search query.
    """
    _validate_provider(provider)
    s = Search(index=index)
    s = s.params(preference=str(ip))
    provider_filter = Q('term', provider=provider.lower())
    s = s.filter('bool', should=provider_filter, minimum_should_match=1)
    licenses = lt if lt else li
    s = _filter_licenses(s, licenses)
    start_slice, end_slice = _get_query_slice(s, page_size, page)
    s = s[start_slice:end_slice]
    search_response = s.execute()
    results = _post_process_results(
        s,
        start_slice,
        end_slice,
        page_size,
        search_response,
        request,
        filter_dead
    )

    result_count, page_count = _get_result_and_page_count(
        search_response,
        results,
        page_size
    )

    return results, page_count, result_count
Example #14
def _get_metrics(es, field_name):
    search = Search(using=es, index=current_app.config['INDEX_NAME'])
    # Traverse the nesting levels from the root field down to the leaf.
    # We have to start at the root because the search object is built by adding
    # Nested aggregations consecutively. For example, a nested "samples.foo"
    # field will result in:
    # Search(...).bucket('samples', Nested(path='samples')).metric(...)
    parts = field_name.split('.')
    bucket = search.aggs
    parent = ''
    nestings = []
    for part in parts:
        parent = '%s.%s' % (parent, part) if parent else part
        if parent in current_app.config['NESTED_PATHS']:
            bucket = bucket.bucket(parent, Nested(path=parent))
            nestings.append(parent)

    bucket.metric('max', Max(field=field_name))
    bucket.metric('min', Min(field=field_name))
    bucket.metric('cardinality', Cardinality(field=field_name))

    aggs = search.params(size=0).execute().aggregations.to_dict()
    for nesting in nestings:
        aggs = aggs.get(nesting)

    return (aggs['min']['value'], aggs['max']['value'],
            aggs['cardinality']['value'])
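A hypothetical call, assuming "samples" is listed in current_app.config['NESTED_PATHS'] and "samples.foo" is a numeric leaf field (both names are illustrative):

# Returns (min, max, cardinality) for the nested field; for a non-nested field
# no Nested bucket is added and the metrics are read from the top of the response.
low, high, distinct = _get_metrics(es, 'samples.foo')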
Example #15
def get_all_ids(index=None, id_field='recid', last_updated=None, latest_first=False):
    """Get all record or inspire ids of publications in the search index

    :param index: name of index to use.
    :param id_field: elasticsearch field to return. Should be 'recid' or 'inspire_id'
    :return: list of integer ids
    """
    if id_field not in ('recid', 'inspire_id'):
        raise ValueError('Invalid ID field %s' % id_field)

    search = Search(using=es, index=index) \
        .filter("term", doc_type=CFG_PUB_TYPE) \
        .source(fields=[id_field])

    if last_updated:
        search = search.filter("range", **{'last_updated': {'gte': last_updated.isoformat()}})

    if latest_first:
        search = search.sort({'last_updated' : {'order' : 'desc'}})
    else:
        search = search.sort('recid')

    search = search.params(preserve_order=True)

    return [int(h[id_field]) for h in search.scan()]
Example #16
def es_author_normalize(name):
    name = name.replace("-", "")
    name = name.replace("'", "")
    s = Search(using=client, index="authors")
    s = s.query("match", NormalizedName=name)
    s = s.params(size=500)
    response = s.execute()
    result = response.to_dict()["hits"]["hits"]
    sorted_list = sorted([s["_source"] for s in result if s["_score"] > 16],
                         key=itemgetter("Rank"))
    if len(sorted_list) == 0:
        sorted_list = sorted(
            [s["_source"] for s in result if s["_score"] > 13],
            key=itemgetter("Rank"))
    if len(sorted_list) == 0:
        sorted_list = sorted([s["_source"] for s in result],
                             key=itemgetter("Rank"))
    sorted_list = sorted(sorted_list,
                         key=itemgetter("PaperCount"),
                         reverse=True)
    # print(name)
    # print(sorted_list)
    data = {}
    try:
        data = sorted_list[0]
    except Exception as e:
        print("[es_author_normalize] no result", name, e)
    return data
Example #17
    def get_update_list_single_process(self):
        """ Find units that needs updating and their sidstopdateret (last updated)
        the sidstopdateret may be inaccurate and thus way to far back in time therefore we cannot use take the largest
        of sidstopdateret from the database. Seems we download like 600 dicts a second with match_all.
        Should take around 2 hours and 30 minuttes then. This takes 30 so i need to save half an hour on downloads.

        :return datetime (min sidstopdateret), list (enhedsnumer, sidstopdateret)
        """
        enh_samtid_map = self.make_samtid_dict()
        oldest_sidstopdateret = datetime.datetime.utcnow().replace(
            tzinfo=pytz.utc) + datetime.timedelta(days=1)
        update_dicts = {
            x: {
                'units': [],
                'sidstopdateret': oldest_sidstopdateret
            }
            for x in self.source_keymap.values()
        }
        if len(enh_samtid_map) == 0:
            return update_dicts
        dummy = CvrConnection.update_info(samtid=-1,
                                          sidstopdateret=self.dummy_date)
        print('Get update time for all data')

        for _type in self.source_keymap.values():
            search = Search(using=self.elastic_client, index=self.index)
            search = search.query('match_all')
            sidst_key = '{0}.sidstOpdateret'.format(_type)
            samt_key = '{0}.samtId'.format(_type)
            field_list = ['_id', sidst_key, samt_key]
            # field_list = ['_id'] + ['{0}.sidstOpdateret'.format(key) for key in self.source_keymap.values()] + \
            #          ['{0}.samtId'.format(key) for key in self.source_keymap.values()]
            search = search.fields(fields=field_list)
            params = {'scroll': self.elastic_search_scroll_time, 'size': 2**12}
            search = search.params(**params)
            print('ElasticSearch Query: ', search.to_dict())
            generator = search.scan()
            for cvr_update in tqdm.tqdm(generator):
                enhedsnummer = int(cvr_update.meta.id)
                raw_dat = cvr_update.to_dict()
                samtid = raw_dat[samt_key][0] if samt_key in raw_dat else None
                sidstopdateret = raw_dat[sidst_key][
                    0] if sidst_key in raw_dat else None
                if sidstopdateret is None or samtid is None:
                    continue
                current_update = enh_samtid_map[
                    enhedsnummer] if enhedsnummer in enh_samtid_map else dummy
                if samtid > current_update.samtid:
                    utc_sidstopdateret = utc_transform(sidstopdateret)
                    update_dicts[_type]['sidstopdateret'] = min(
                        utc_sidstopdateret,
                        update_dicts[_type]['sidstopdateret'])
                    update_dicts[_type]['units'].append(
                        (enhedsnummer, utc_sidstopdateret))
                    # break
        print('Update Info: ')
        print([(k, v['sidstopdateret'], len(v['units']))
               for k, v in update_dicts.items()])
        return update_dicts
Example #18
    def get_asset_names(self, start):
        s = Search(using='objects', index="objects-asset") \
            .query('prefix', symbol__keyword=start)              \
            .source(['symbol'])
        s = s.params(clear_scroll=False) # Avoid calling DELETE on ReadOnly apis.

        asset_names = [ hit.symbol for hit in s.scan()]
        return asset_names
Example #19
    def get_accounts(self, account_ids, size=1000):
        s = Search(using='objects', index="objects-account", extra={'size': size })
        s = s.filter('terms', id=account_ids)
        s = s.source([ 'id', 'name', 'options.voting_account'])
        s = s.params(clear_scroll=False) # Avoid calling DELETE on ReadOnly apis.

        accounts = [hit.to_dict() for hit in s.scan()]
        return accounts
Example #20
    def get_asset_ids(self):
        s = Search(using='objects', index="objects-asset") \
            .query('match_all')                            \
            .source(['id'])
        s = s.params(clear_scroll=False) # Avoid calling DELETE on ReadOnly apis.

        asset_ids = [ hit.id for hit in s.scan()]
        return asset_ids
Example #21
def paper_info_cache_query(
        paper_ids, batch_size=DEFAULT_BATCH, query_filter=None):
    """ Gets paper info from cache.
    """
    start = datetime.now()

    # Query results
    complete_info = list()
    partial_info  = list()
    seen = set()

    # Query for paper info
    paper_info_s = Search(index = 'paper_info', using = client)
    paper_info_s = paper_info_s.filter('terms', _id = paper_ids)
    paper_info_s = paper_info_s.params(size=DEFAULT_BATCH)
    if query_filter is not None:
        paper_info_s = paper_info_s.query(query_filter)

    # Convert query into dictionary format
    for paper_info in paper_info_s.scan():
        paper_info_res = paper_info.to_dict()

        # Remove the creation date for query
        field_del(paper_info_res, 'CreatedDate')

        # Check the type of the result
        if 'FieldsOfStudy' not in paper_info_res:
            continue

        if paper_info_res['cache_type'] == 'partial':
            partial_info.append(paper_info_res)
        else:
            skip = False
            for ref in paper_info_res['References']:
                if 'FieldsOfStudy' not in ref:
                    skip = True
                    continue

            for cit in paper_info_res['Citations']:
                if 'FieldsOfStudy' not in cit:
                    skip = True
                    continue

            if skip:
                continue
            complete_info.append(paper_info_res)

        del paper_info_res['cache_type']

        # Add to seen set
        seen.add(paper_info_res['PaperId'])

    print(batch_size, datetime.now() - start)

    # Check for no results and return
    return {'complete': complete_info, 'partial': partial_info,
            'missing': set(paper_ids) - seen}
Example #22
def es_issue_count(es_client: Any, container_id: str, year: int, volume: str,
                   issue: str) -> int:
    search = Search(using=es_client, index="fatcat_release")
    search = (search.filter("term", container_id=container_id).filter(
        "term", year=year).filter("term", volume=volume).filter(
            "term", issue=issue).extra(request_cache=True))
    search = search.params(request_cache="true")

    return search.count()
Example #23
def get_elastic_search_coverage(query: ReleaseQuery) -> dict:

    search = Search(using=app.es_client, index=app.config["ELASTICSEARCH_RELEASE_INDEX"])
    search = search.query(
        "query_string",
        query=query.q,
        default_operator="AND",
        analyze_wildcard=True,
        allow_leading_wildcard=False,
        lenient=True,
        fields=["biblio"],
    )
    search.aggs.bucket(
        "preservation",
        "terms",
        field="preservation",
        missing="_unknown",
    )
    if query.recent:
        date_today = datetime.date.today()
        start_date = str(date_today - datetime.timedelta(days=60))
        end_date = str(date_today + datetime.timedelta(days=1))
        search = search.filter("range", release_date=dict(gte=start_date, lte=end_date))

    search = search[:0]

    search = search.params(request_cache=True)
    search = search.params(track_total_hits=True)
    resp = wrap_es_execution(search)

    preservation_bucket = agg_to_dict(resp.aggregations.preservation)
    preservation_bucket["total"] = _hits_total_int(resp.hits.total)
    for k in ("bright", "dark", "shadows_only", "none"):
        if k not in preservation_bucket:
            preservation_bucket[k] = 0
    if app.config["FATCAT_MERGE_SHADOW_PRESERVATION"]:
        preservation_bucket["none"] += preservation_bucket["shadows_only"]
        preservation_bucket["shadows_only"] = 0
    stats = {
        "total": _hits_total_int(resp.hits.total),
        "preservation": preservation_bucket,
    }

    return stats
Example #24
def pfos_prop_query(paper_ids):
    ''' Get fields of study for a set of papers.
    '''

    # Elastic search client
    client = Elasticsearch(conf.get("elasticsearch.hostname"))

    # Targets
    pfos_targets = ['PaperId', 'FieldOfStudyId']

    # Query for paper affiliation
    pfos_s = Search(index='paperfieldsofstudy', using=client)
    pfos_s = pfos_s.query('terms', PaperId=paper_ids)
    pfos_s = pfos_s.source(pfos_targets)
    pfos_s = pfos_s.params(request_timeout=TIMEOUT)

    # Convert pfos results into dictionary format
    results = dict()
    fos_ids = set()
    for pfos in pfos_s.scan():
        pfos_res = pfos.to_dict()

        # Get fields
        paper_id = pfos_res['PaperId']
        del pfos_res['PaperId']

        # Field of study
        if 'FieldOfStudyId' in pfos_res:
            fos_ids.add(pfos_res['FieldOfStudyId'])

        # Aggregate results
        if paper_id in results:
            results[paper_id].append(pfos_res)
        else:
            results[paper_id] = [pfos_res]

    fos_names, fos_levels = fos_name_level_dict_query(list(fos_ids))

    res = dict()
    for p_id, pfos_info_list in results.items():
        pfos_res = list()
        for pfos_info in pfos_info_list:
            if 'FieldOfStudyId' in pfos_info:
                if pfos_info['FieldOfStudyId'] in fos_names:
                    pfos_info['FieldOfStudyName'] = fos_names[
                        pfos_info['FieldOfStudyId']]
                    pfos_info['FieldOfStudyLevel'] = fos_levels[
                        pfos_info['FieldOfStudyId']]
                else:
                    continue
            pfos_res.append(pfos_info)

        res[p_id] = pfos_res

    # Return as dictionary
    return res
Example #25
def es_get_paper_conf_year(confid, year):
    s = Search(using=client, index="papers")
    s = s.query(
        Q('bool',
          must=[Q('match', ConferenceSeriesId=confid),
                Q('match', Year=year)]))
    s = s.params(size=500)
    response = s.execute()
    result = response.to_dict()["hits"]["hits"]
    return result
Example #26
def es_search_papers_from_confid(confid, papercnt):
    s = Search(using=client, index="papers") \
        .query("match", ConferenceSeriesId=confid)
    s = s.params(preserve_order=True)
    data = []
    for position, hit in enumerate(s.scan()):
        if position == papercnt:
            break
        data.append(hit.to_dict())
    return data
Example #27
def es_search_aff_info_from_pid(paperid):
    s = Search(using=client, index="paperauthoraffiliations")
    s = s.query("match", PaperId=paperid)
    s = s.params(size=1000)
    response = s.execute()
    result = response.to_dict()["hits"]["hits"]
    data = []
    if result:
        data = [res["_source"] for res in result]
    else:
        print("[es_search_aff_info_from_pid] no result", authorid)
    return data
Example #28
def es_get_paper_fos(paperid):
    s = Search(using=client, index="paperfieldsofstudy")
    s = s.query("match", PaperId=paperid)
    s = s.params(size=500)
    response = s.execute()
    result = response.to_dict()["hits"]["hits"]
    data = []
    if result:
        data = [res["_source"] for res in result]
    else:
        print("[es_get_paper_fos] no result", paperid)
    return data
Example #29
def fos_name_query(fos_ids):
    if not fos_ids:
        return []

    fos_target = 'NormalizedName'

    # Query for the fields of study
    fos_s = Search(index='fieldsofstudy', using=client)
    fos_s = fos_s.query('terms', FieldOfStudyId=fos_ids)
    fos_s = fos_s.source(fos_target)
    fos_s = fos_s.params(request_timeout=30)

    return list(map(itemgetter(fos_target), fos_s.scan()))
Example #30
    def download_all_data_to_file(self, filename):
        """
        :return:
        str: filename, datetime: download time, bool: new download or use old file
        """
        params = {
            'scroll': self.elastic_search_scroll_time,
            'size': self.elastic_search_scan_size
        }
        search = Search(using=self.elastic_client, index=self.index)
        search = search.query('match_all')
        search = search.params(**params)
        download_all_dicts_to_file(filename, search)
Example #31
    def search(self, index, doc_type, col_filters=None):

        # find whether we have a search alias for the given index
        if index in settings.ES_ROLLOVER:
            index = settings.ES_ROLLOVER[index]['search_index']

        logger.debug("Searching index %s for doc_type %s and col_filters %s"
                     % (index, doc_type, col_filters))

        s = Search(using=self.client, index=index, doc_type=doc_type)
        if col_filters:
            for col_filter in col_filters:
                if isinstance(col_filter, ColumnFilter):
                    s = s.filter(col_filter.query_type, **col_filter.query)
                else:
                    raise ValueError('Column Filter is not an instance of'
                                     ' ColumnFilter class')
        s = s.params(size=MAX_NUMBER_DOCS)

        results = s.execute()

        logger.debug("Search returned %s records from elasticsearch."
                     % len(results))
        return [res.to_dict() for res in results]
Example #32
class Elastic(LogProvider):
    def __init__(self, config_file='config.cfg'):
        super(Elastic, self).__init__()

        self.percentage=10.0
        self.minimum_occurrences=250

        # The ConfigParser documentation points out that there's no way to force
        # default config options outside the "DEFAULT" section.
        config = ConfigParser()
        config.read(config_file)
        if not config.has_section('elastic'):
            config.add_section('elastic')
        
        for option, value in {'use_ssl': 'True', 'host': '127.0.0.1', 'version': '2', 'index': 'nxapi', 'doc_type': 'events'}.items():
            if not config.has_option('elastic', option):
                config.set('elastic', option, value)

        self.version = config.getint('elastic', 'version')
        self.index = config.get('elastic', 'index')
        use_ssl = config.getboolean('elastic', 'use_ssl')
        host = config.get('elastic', 'host')
        self.doc_type = config.get('elastic', 'doc_type')
        self.client = connections.create_connection(hosts=[host], use_ssl=use_ssl, index=self.index, version=self.version, doc_type=self.doc_type, timeout=30, retry_on_timeout=True )

        Event.init(index=self.index)
        index = Index(self.index, using=self.client)
        index.doc_type(Event)
        self.initialize_search()

    def initialize_search(self):
        self.search = Search(using=self.client, index=self.index).extra(size=10000)
        
    def export_search(self):
        return self.search

    def import_search(self, search):
        self.search = search

    def get_filters(self):
        return self.search.to_dict()

    def add_filters(self, filters, regexp=False, negative=False):
        """
        Add `filters` to the query.
         `filters` is a dict of the form {'field': value, 'field2': value2}, but you can also use a list of values
         instead of a `str`. They'll be added as an _or_ (and not an _and_).
        :param dict filters:
        :param bool regexp:
        :param bool negative:
        :return:
        """
        # We need to use multi_match, since we get the fields names dynamically.
        for key, value in filters.items():
            if isinstance(value, set):
                value = list(value)

            # There is no need to process empty values.
            if not value:
                continue

            if isinstance(value, list):
                if negative:
                    self.search = self.search.query(Q('bool', must_not=[
                        reduce(operator.or_, [Q('multi_match', query=v, fields=[key]) for v in value])])
                    )
                else:
                    self.search = self.search.query(Q('bool', must=[
                        reduce(operator.or_, [Q('multi_match', query=v, fields=[key]) for v in value])])
                    )
            else:
                if negative:
                    self.search = self.search.query(~Q("multi_match", query=value, fields=[key]))
                else:
                    self.search = self.search.query(Q("multi_match", query=value, fields=[key]))

    def get_top(self, field, size=250):
        """
        Get the top values for the given `field`
        :param str field: the field to filter on
        :param int size: how many top values to return
        :return dict of int: A structure of the form {value: number_of_hits, value2: number_of_hits2}
        """
        search = self.search
        ret = dict()

        if field in ['uri', 'vers', 'comments', 'server']:
            field = ''.join((field, '.raw'))

        if VERSION < (5, 0, 0):
            self.search = self.search.params(search_type='count', default_operator='AND')
        else:
            self.search = self.search.params(search_type='query_then_fetch')
        # This is documented at https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.Elasticsearch.search
        # search_type='count' has been deprecated in ES 2.0
        self.search.aggs.bucket('TEST', 'terms', field=field)
        for hit in self.search.execute(ignore_cache=True).aggregations['TEST']['buckets']:
            ret[hit['key']] = hit['doc_count']
        self.search = search
        return ret

    def get_relevant_ids(self, fields, percentage=0, minimum_occurrences=0):
        """ This function is supposed to return the id that are reparteed/present on the `fields`.

         :param list of str fields:
         :param float percentage:
         :param float minimum_occurrences:
         :return set of int:
         """
        minimum_occurrences = minimum_occurrences or self.minimum_occurrences
        percentage = percentage or self.percentage

        ret = set()
        search = self.search
        ids = set(i['id'] for i in self.search.execute())  # get all possible ID
        self.search = search

        for _id in ids:
            search = self.search

            self.add_filters({'id': _id})

            # Get how many different fields there are for a given `id`
            data = collections.defaultdict(set)
            fields_counter = collections.defaultdict(int)
            for res in self.search.execute():
                for field in fields:
                    if res[field] not in data[field]:
                        fields_counter[field] += 1.0
                    data[field].add(res[field])

            # Ignore ids that are present on less than 10% of the different values of each field
            for field, content in data.items():
                if len(content) < minimum_occurrences:
                    logging.debug('Discarding id \033[32m%s\033[0m only present %d times.', _id, len(content))
                    continue
                _percentage = len(content) / fields_counter[field] * 100.0
                if _percentage > percentage:
                    continue
                logging.debug('Discarding id \033[32m%s\033[0m present in %d%% of different values of the \033[32m%s\033[0m field', _id, _percentage, field)
                break
            else:
                ret.add(_id)
            self.search = search

        return ret

    def reset_filters(self):
        self.search = Search(using=self.client, index=self.index).extra(size=10000)

    def get_results(self):
        """
        Return a `Result` object obtained from the execution of the search `self.search`.
        :return Result: The `Result` object obtained from the execution of the search `self.search`.
        """
        search = self.search
        result = self.search.scan()
        self.search = search
        return result

    def commit(self):
        """Process list of dict (yes) and push them to DB """
        self.total_objs += len(self.nlist)
        count = 0

        def gen_events(events):
            dicts = list()
            for d in events:
                dicts.extend([{'index': {'_index': 'nxapi', '_type': 'events'}}, d.to_dict()])
                yield dicts.pop(-2)
                yield dicts.pop(-1)


        events = list()
        for entry in self.nlist:
            event = Event(_index=self.index)
            for key, value in entry.items():
                setattr(event, key, value)

            event.whitelisted = False
            event.comments = "import on"+str(datetime.datetime.now())
            events.append(event)
            count += 1

        try:
            ret = self.client.bulk(gen_events(events))
            ## ToDo parse ret to selectively loop over events to events.save() whatever happens
        except TransportError as e:
            logging.warning("We encountered an error trying to continue.")
            for event in events:
                event.save(using=self.client)
                ## ToDo find a way to change the hardcoded 'events' for ES doctype
                ## elasticsearch_dsl Issue 689
               
        self.total_commits += count
        logging.debug("Written "+str(self.total_commits)+" events")
        del self.nlist[0:len(self.nlist)]