def func_dbpedia_spotlight(d):
    """
    Helper function for processing a paper in a thread with DBpedia Spotlight.

    The language is detected on the body text with pycld2 ('en' is used when
    detection fails) and is passed to every DBpedia Spotlight request made
    for this paper.

    :param d: content of the paper; the keys 'paper_id', 'metadata' (with
        'title'), 'ref_entries' and 'back_matter' are read here
    :return: result of the annotation with DBpedia Spotlight in JSON ||
        None if the JSON annotation exists already
    """
    paper_id = d['paper_id']
    title = d["metadata"]["title"]
    output_file = (path_output + '/dbpedia-spotlight/' + folder + '/'
                   + paper_id + '.json')

    # Already annotated by a previous run: count the paper as done and skip.
    if os.path.isfile(output_file):
        pbar.update()
        return None

    # NOTE(review): this second skip check looks at a hard-coded
    # entity-fishing directory and does not advance the progress bar; it may
    # be a copy-paste leftover. Behaviour kept as-is -- confirm the intent.
    if os.path.isfile('/data/CORD19-Annotation-multi/entity-fishing/'
                      + folder + '/' + paper_id + '.json'):
        return None

    # Extract the text outside the detection guard: a failure here must
    # surface at its source instead of as a NameError on `body_text` later.
    body_text = cotools.text(d)

    try:
        _, _, _, vectors = pycld2.detect(body_text, returnVectors=True)
        lang = vectors[0][3]
    # Detection failed or returned no vector (None or out of range).
    except Exception:
        lang = 'en'

    d_json = {"paper_id": paper_id, "lang": lang}

    # Best effort: the "abstract" key is simply left out when the abstract
    # cannot be extracted or annotated (e.g. the paper has no abstract).
    try:
        abstract = cotools.abstract(d)
        d_json["abstract"] = wa.request_dbpedia_spotlight(abstract, lang)
    except Exception:
        pass

    d_json["title"] = wa.request_dbpedia_spotlight(title, lang)
    d_json["body_text"] = wa.request_dbpedia_spotlight(body_text, lang)

    # Use the detected language here too, consistently with the title,
    # abstract and body requests above.
    d_json["ref_entries"] = {
        key: wa.request_dbpedia_spotlight(value["text"], lang)
        for key, value in d["ref_entries"].items()
    }

    #d_json["bib_entries"] = {}
    #for key, value in d["bib_entries"].items():
    #    d_json["bib_entries"][key] = wa.request_dbpedia_spotlight(value["title"])

    # Only back-matter entries that carry a 'text' field are annotated.
    d_json["back_matter"] = [
        {'text': wa.request_dbpedia_spotlight(matter['text'], lang)}
        for matter in d["back_matter"]
        if 'text' in matter
    ]

    Output().save_json(d_json, output_file)
    pbar.update()
    return d_json
# Quick look at the loaded paper set: container size, paper count and one
# sample paper (index 13) with its text and abstract.
print(f"{sys.getsizeof(data)} bytes")
print(str(len(data)) + " papers")

print()
print("How data[index] looks like:")
_sample = data[13]
pprint(_sample)

print()
print("How text looks like")
pprint(co.text(_sample))

print()
print("How abstract looks like")
try:
    _sample_abstract = co.abstract(_sample)
except KeyError:
    # The sample paper has no abstract.
    print("Abstract Not Found")
else:
    pprint(_sample_abstract)

#pprint(co.abstracts(data[14:18]))

#abstracts = data.abstracts()
#pprint(abstracts)

## finding abstracts
print()
print("Finding abstracts")
#for x in data[100:5000]:
#    try:
#        pprint(co.abstract(x))
#    except KeyError:
# Third party modules
import cotools
from cotools import abstract, text


def _abstract_or_empty(paper):
    """Return the abstract of *paper*, or "" when the lookup raises KeyError."""
    try:
        return abstract(paper)
    except KeyError:
        # Papers without an abstract must not abort the whole scan.
        return ""


def _mentions_any(paper, keywords):
    """Tell whether the lower-cased text or abstract contains a keyword."""
    # Lower-case each field once instead of once per keyword; the abstract
    # is only inspected when the text does not match.
    body = text(paper).lower()
    if any(word in body for word in keywords):
        return True
    summary = _abstract_or_empty(paper).lower()
    return any(word in summary for word in keywords)


data = cotools.Paperset("data/all")

# Papers mentioning "digest" (case-sensitive) in their text or abstract.
digest = [
    x for x in data
    if "digest" in text(x) or "digest" in _abstract_or_empty(x)
]

cov = ["covid", "novel_coronavirus"]

# Of those, keep the papers that also mention one of the `cov` keywords.
digest_covid = [x for x in digest if _mentions_any(x, cov)]

# Bare expression: the count is only displayed in an interactive session.
len(digest_covid)

for d in digest_covid:
    print("-" * 55)
    print("\r\n")
    print("NEW PAPER")
    print("\r\n")
    print(_abstract_or_empty(d))
    print(text(d))