# Imports needed by make_index; Cord19Schema is the whoosh schema defined
# elsewhere in this module.
import cotools
from whoosh import index


def make_index(paper_directory=".", num_docs=None):
    '''
    Create a searchable index from the data set.

    Parameters
    ----------
    paper_directory : str
        A path to a directory where the data has been downloaded.
    num_docs : int
        The number of documents to analyze. This is mostly for testing.
        Set this to a small number to analyze only the first num_docs
        documents.

    Returns
    -------
    None
    '''
    allpapers = cotools.Paperset(paper_directory)
    ix = index.create_in(paper_directory, Cord19Schema)
    writer = ix.writer()
    for i in range(0, num_docs if num_docs is not None else len(allpapers)):
        # print a simple progress indicator
        if (i + 1) % 100 == 0:
            print(".", end='', flush=True)
        if (i + 1) % 1000 == 0:
            print()
        paper_title = allpapers[i]["metadata"]["title"]
        text_content = cotools.text(allpapers[i])
        # fall back to the title when the paper has no body text
        if len(text_content.strip()) == 0:
            text_content = paper_title
        writer.add_document(paperid=i, title=paper_title, content=text_content)
    print("\nDone. Committing Index")
    writer.commit()
    print("Done Indexing")
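# A minimal usage sketch (not part of the original module): build the index and
# query it with whoosh. The query string, result limit, and the assumption that
# "paperid" and "title" are stored fields in Cord19Schema are illustrative only.
from whoosh import index
from whoosh.qparser import QueryParser

make_index(paper_directory="data/comm_use_subset", num_docs=500)
ix = index.open_dir("data/comm_use_subset")
with ix.searcher() as searcher:
    query = QueryParser("content", ix.schema).parse("incubation period")
    for hit in searcher.search(query, limit=5):
        print(hit["paperid"], hit["title"])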
def func_ncbo(d):
    """
    Helper function for processing a paper in a thread with NCBO BioPortal Annotator+

    :param d: content of the paper
    :return: result of the annotation with NCBO BioPortal Annotator+ in JSON
             || None if the JSON annotation exists already
    """
    # Relies on module-level names defined elsewhere in this script:
    # os, pycld2, cotools, wa (annotator wrapper), Output, path_output, folder, pbar
    d_json = {}
    paper_id = d['paper_id']
    title = d["metadata"]["title"]
    # skip papers that have already been annotated
    if os.path.isfile(path_output + '/ncbo/' + folder + '/' + paper_id + '.json'):
        return None
    try:
        body_text = cotools.text(d)
        isreliable, textbytesfound, details, vectors = pycld2.detect(
            body_text, returnVectors=True)
        lang = vectors[0][3]
    except:  # None or out of range
        lang = 'en'
    if os.path.isfile('/data/CORD19-Annotation-multi/entity-fishing/'
                      + folder + '/' + paper_id + '.json'):
        return None
    d_json["paper_id"] = paper_id
    d_json["lang"] = lang
    try:
        abstract = cotools.abstract(d)
        d_json["abstract"] = wa.request_ncbo_plus(abstract, lang)
    except:  # no abstract
        pass
    body_text = cotools.text(d)
    d_json["title"] = wa.request_ncbo_plus(title, lang)
    d_json["body_text"] = wa.request_ncbo_plus(body_text, lang)
    d_json["ref_entries"] = {}
    for key, value in d["ref_entries"].items():
        d_json["ref_entries"][key] = wa.request_ncbo_plus(value["text"], lang)
    d_json["back_matter"] = []
    for matter in d["back_matter"]:
        for key, value in matter.items():
            if key == 'text':
                text = {'text': wa.request_ncbo_plus(value)}
                d_json["back_matter"].append(text)
    pbar.update()
    Output().save_json(
        d_json,
        path_output + '/ncbo/' + folder + '/' + d["paper_id"] + '.json')
    return d_json
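# A minimal usage sketch (assumed, not from the original script): func_ncbo is
# written to be mapped over papers from a thread pool. The Paperset path and
# worker count are illustrative, and the module-level globals listed above
# (path_output, folder, wa, pbar, ...) must already be configured.
from concurrent.futures import ThreadPoolExecutor

import cotools

papers = cotools.Paperset("data/custom_license")
with ThreadPoolExecutor(max_workers=8) as pool:
    results = list(pool.map(func_ncbo, (papers[i] for i in range(len(papers)))))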
def index(self, docs):
    """
    Index the documents using the scoring method

    Input: The Paperset lazy loader containing the documents
    """
    num_docs = len(docs)
    self.num_docs = num_docs
    for i in range(num_docs):
        doc = docs[i]
        # get the text
        doc_text = cotools.text(doc)
        tokens, tags = _get_doc_tokens_and_tags(doc_text)
        # update the number of times a token appears
        self.word_freq.update(tokens)
        # update the number of times a token appears in a document
        # use set to get unique tokens only
        self.doc_freq.update(set(tokens))
        if tags is not None:
            # update the list of tags
            self.tags.update(tags.split())
    self.total_tokens = sum(self.word_freq.values())
    # number of unique tokens
    num_tokens = len(self.word_freq.values())
    # avg frequency per token
    self.avg_freq = self.total_tokens / num_tokens
    # average doc length
    self.avg_doc_len = self.total_tokens / self.num_docs
    # compute scores using the scoring method
    for token, freq in self.doc_freq.items():
        # TODO: implement compute_score
        self.scores[token] = self.compute_score(freq)
    # average score over all tokens, used as the fallback in weights()
    self.avg_score = sum(self.scores.values()) / len(self.scores)
    # filter for tags that appear in roughly 1% of documents or more
    _filter_tags(.009)
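# compute_score is still a TODO above. Purely as an illustration (an assumption,
# not the author's scoring method), a smoothed IDF-style score could be plugged
# in, so that rarer tokens receive larger scores.
import math


def compute_score(self, doc_freq):
    # inverse document frequency with add-one smoothing
    return math.log((1 + self.num_docs) / (1 + doc_freq)) + 1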
def weights(self, doc):
    """
    Build the weight vector for the tokens in the document

    Input: doc -- a single paper (dict) to compute token weights for
    """
    weights = list()
    # get the text
    doc_text = cotools.text(doc)
    tokens, _ = _get_doc_tokens_and_tags(doc_text)
    num_tokens = len(tokens)
    for token in tokens:
        # get the frequency and score for the token
        freq = self.word_freq.get(token)
        score = self.scores.get(token)
        # fall back to the corpus averages for unseen tokens
        if not freq:
            freq = self.avg_freq
        if not score:
            score = self.avg_score
        token_weight = self._get_token_weight(freq, score, num_tokens)
        weights.append(token_weight)
    # boost weights of tag tokens to equal the largest weight in the list
    if self.tags:
        tags = {
            token: self.tags[token]
            for token in tokens if token in self.tags
        }
        if tags:
            max_weight = max(weights)
            max_tag = max(tags.values())
            weights = [
                max(max_weight * (tags[tokens[x]] / max_tag), weight)
                if tokens[x] in tags else weight
                for x, weight in enumerate(weights)
            ]
    return weights
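# A minimal usage sketch: the class name TokenScorer is a placeholder (the real
# class name is not shown in this excerpt), and the Paperset path is illustrative.
import cotools

papers = cotools.Paperset("data/comm_use_subset")
scorer = TokenScorer()  # hypothetical constructor for the class defined above
scorer.index(papers)
doc_weights = scorer.weights(papers[0])
print(len(doc_weights), doc_weights[:10])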
import cotools
from pprint import pprint

# cotools.download(dir="data")

noncomm = cotools.Paperset("data/noncomm_use_subset")
data = cotools.Paperset("data/comm_use_subset")

pprint(data[0])
print(type(data[0]))

# get the text for one feature
cotools.text(data[0])
cotools.texts(data[:15])

data.text()
data.apply(len)
# dict

pprint(data[:2])
print(type(data[2:5]))
# list

print(len(data))

# takes about 5gb in memory
alldata = data[:]
import os
import sys
from pprint import pprint

import cotools as co

co.download(dir='data', match="2020-04-10", regex=True)
pprint(os.listdir('data'))

data = co.Paperset('data/custom_license')
print(str(sys.getsizeof(data)) + ' bytes')
print(f"{len(data)} papers")
print()

print("What data[index] looks like:")
pprint(data[13])
print()

print("What the text looks like:")
pprint(co.text(data[13]))
print()

print("What the abstract looks like:")
try:
    pprint(co.abstract(data[13]))
except KeyError:
    print("Abstract Not Found")

# pprint(co.abstracts(data[14:18]))
# abstracts = data.abstracts()
# pprint(abstracts)
## finding abstracts
print()
def each_paper_text(somepapers, range_min=0, range_max=None):
    for i in range(range_min,
                   len(somepapers) if range_max is None else range_max):
        alltitles.append(somepapers[i]["metadata"]["title"])
        yield (alltitles[-1] + "\n\n" + cotools.text(somepapers[i]))
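# A minimal usage sketch (assumed, not from the original script): each_paper_text
# yields "title\n\nbody" strings and appends titles to a module-level alltitles
# list as a side effect, so that list has to exist before iterating.
import cotools

alltitles = []
papers = cotools.Paperset("data/comm_use_subset")
for paper_text in each_paper_text(papers, range_max=10):
    print(len(paper_text), "characters")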
# Third party modules
import cotools
from cotools import abstract, text

data = cotools.Paperset("data/all")

digest = [
    x for x in data
    if "digest" in cotools.text(x) or "digest" in cotools.abstract(x)
]

cov = ["covid", "novel_coronavirus"]

digest_covid = [
    x for x in digest
    if any(c in text(x).lower() for c in cov)
    or any(c in abstract(x).lower() for c in cov)
]

len(digest_covid)

for d in digest_covid:
    print("-" * 55)
    print("\r\n")
    print("NEW PAPER")
    print("\r\n")
    print(abstract(d))
    print(text(d))