Beispiel #1
0
def extract_meta(article):
    """Collect basic metadata for an article.

    The raw ``article`` is wrapped in ``pubcrawler.Article`` and its
    identifier, article-type and keyword accessors are queried.

    Returns:
        A dict with a single ``'meta'`` key whose value holds the
        ``'article-ids'``, ``'article-type'`` and ``'keywords'`` entries.
    """
    wrapped = pubcrawler.Article(article)
    meta = {}
    meta['article-ids'] = wrapped.pub_ids()
    meta['article-type'] = wrapped.article_type()
    # TODO: also emit 'pub-dates' (wrapped.pub_dates()) once the date
    # handling in Mongo has been fixed.
    meta['keywords'] = wrapped.keywords()
    return {'meta': meta}
Beispiel #2
0
def extract_geonames(article, store_all=False):
    """Annotate an article body with geonames and return them de-duplicated.

    Args:
        article: Raw article passed straight to ``pubcrawler.Article``.
        store_all: When true, every attribute named in ``GEONAME_ATTRS``
            plus ``'score'`` is copied from each geoname; otherwise only
            ``'geonameid'`` is kept.

    Returns:
        A dict with ``'index.geonames'`` set to 1 and the unique geoname
        records listed under ``'geonames'`` -> ``'culled'``.
    """
    body = pubcrawler.Article(article).body
    doc = AnnoDoc(body)
    tier = doc.require_tiers('geonames', via=geoname_annotator)

    # Keyed by geonameid so a place mentioned several times is stored once;
    # a later mention overwrites an earlier one.
    records_by_id = {}
    for span in tier:
        geoname = span.metadata['geoname']
        if store_all:
            record = {key: geoname[key] for key in GEONAME_ATTRS + ['score']}
        else:
            record = {'geonameid': geoname['geonameid']}
        records_by_id[record['geonameid']] = record

    culled = list(records_by_id.values())
    return {
        'index.geonames': 1,
        'geonames': {'culled': culled},
    }
Beispiel #3
0
def extract_disease_ontology_keywords(article):
    """Extract the disease entities that the article's keywords resolve to.

    The article body is annotated with the ``resolved_keywords`` tier; each
    resolution whose entity has type ``'disease'`` contributes one record.

    Returns:
        A dict with ``'index.keywords'`` set to 1 and, under
        ``'keywords'`` -> ``'disease-ontology'``, a list of
        ``{"keyword": <label>, "uri": <id>}`` dicts, one per distinct
        entity id.
    """
    body = pubcrawler.Article(article).body
    doc = AnnoDoc(body)
    tier = doc.require_tiers('resolved_keywords', via=keyword_annotator)

    # Keyed by entity id so each disease appears only once in the output.
    diseases_by_uri = {}
    for span in tier:
        for resolution in span.metadata['resolutions']:
            entity = resolution['entity']
            if entity['type'] != 'disease':
                continue
            uri = entity['id']
            diseases_by_uri[uri] = {
                "keyword": entity['label'],
                "uri": uri
            }

    return {
        'index.keywords': 1,
        'keywords': {
            'disease-ontology': list(diseases_by_uri.values())
        }
    }
Beispiel #4
0
def extract_disease_ontology_keywords(article):
    """Extract keywords from an article body and resolve each to a URI.

    The body is annotated with ``keyword_annotator``; the text of every
    span in the resulting ``'keywords'`` tier is resolved through
    ``resolve_keyword``. Each distinct keyword text is reported once.

    Returns:
        ``{'keywords': {'disease-ontology': value}}`` where ``value`` is a
        list of ``{"keyword": <text>, "uri": <resolved uri>}`` dicts in
        order of first appearance, or ``None`` when the tier has no spans.
    """
    pc_article = pubcrawler.Article(article)
    anno_doc = AnnoDoc(pc_article.body)
    anno_doc.add_tier(keyword_annotator)

    seen_keywords = set()
    disease_ontology_keywords = []
    for span in anno_doc.tiers['keywords'].spans:
        keyword = span.text
        if keyword in seen_keywords:
            continue
        seen_keywords.add(keyword)
        # Only the first occurrence of a keyword is resolved; repeated
        # mentions would produce the same record and are skipped above.
        resolutions = resolve_keyword(keyword)
        # NOTE(review): indexing [0] assumes resolve_keyword always returns
        # at least one result for an annotated keyword -- confirm.
        disease_ontology_keywords.append({
            "keyword": keyword,
            "uri": resolutions[0].entity.toPython()
        })

    # Callers receive None (not an empty list) when nothing was found.
    return ({
        'keywords': {
            'disease-ontology': disease_ontology_keywords or None
        }
    })
Beispiel #5
0
def extract_geonames(article):
    """Score candidate geonames for an article and return all and culled sets.

    Candidate locations are obtained from ``geoname_annotator``, scored with
    a fixed set of feature weights, filtered by score, turned into
    ``GeoSpan`` objects and finally culled by the annotator.

    Returns:
        ``{'geonames': {'all': [...], 'culled': [...]}}`` where ``'all'``
        holds every candidate location dict (with an ``'annie_features'``
        entry added and the ``'spans'`` / ``'alternateLocations'`` entries
        removed) and ``'culled'`` holds ``to_dict()`` of each geospan that
        survived culling.
    """
    body = pubcrawler.Article(article).body
    anno_doc = AnnoDoc(body)
    candidates = geoname_annotator.get_candidate_geonames(anno_doc)

    # Generate and score features.
    features = geoname_annotator.extract_features(candidates)
    # close_locations, closest_location and containment_level (0.8 each)
    # are currently left out of the weighting.
    weights = {
        'population_score': 2.0,
        'synonymity': 1.0,
        'num_spans_score': 0.4,
        'short_span_score': -5,
        'NEs_contained': 1.2,
        # Distinctness is probably more effective when combined
        # with other features.
        'distinctness': 1.0,
        'max_span_score': 1.0,
        'cannonical_name_used': 0.5,
        'feature_code_score': 0.6,
    }
    for candidate, feature in zip(candidates, features):
        candidate['score'] = feature.score(weights)

    high_scoring = [c for c in candidates if c['score'] > 50]

    geo_spans = []
    for candidate in high_scoring:
        # Copy the dict so we don't need to return a custom class; the one
        # copy is shared by every span of this location.
        location_copy = dict(candidate)
        # TODO: Adjust scores to give geospans that exactly match
        # a corresponding geoname a bonus.
        geo_spans.extend(
            GeoSpan(span.start, span.end, anno_doc, location_copy)
            for span in location_copy['spans']
        )
    surviving_geospans = geoname_annotator.cull_geospans(geo_spans)

    omitted_props = ['spans', 'alternateLocations']

    # Every candidate with its feature vector. The candidate dicts are
    # stripped in place, which is why this runs only after the spans have
    # been consumed above.
    all_geonames = []
    for candidate, feature in zip(candidates, features):
        for prop in omitted_props:
            candidate.pop(prop, None)
        candidate['annie_features'] = feature.to_dict()
        all_geonames.append(candidate)

    # Strip the same properties from each surviving geospan's geoname
    # before serialising the geospan.
    culled_geonames = []
    for geospan in surviving_geospans:
        geoname = geospan.geoname
        for prop in omitted_props:
            geoname.pop(prop, None)
        culled_geonames.append(geospan.to_dict())

    return {'geonames': {'all': all_geonames, 'culled': culled_geonames}}