def make_index(paper_directory=".", num_docs=None):
    '''
    Create a searchable index from the data set.

    Parameters
    ----------
    paper_directory : str
        A path to a directory where the data has been downloaded.
    num_docs : int
        The number of documents to analyze. This is mostly for testing.
        Set this to a small number to analyze only the first num_docs
        documents.

    returns : nothing
    '''
    allpapers = cotools.Paperset(paper_directory)
    ix = index.create_in(paper_directory, Cord19Schema)
    writer = ix.writer()
    for i in range(0, num_docs if num_docs is not None else len(allpapers)):
        # Print a progress indicator every 100 papers, newline every 1000.
        if (i + 1) % 100 == 0:
            print(".", end='', flush=True)
        if (i + 1) % 1000 == 0:
            print()
        paper_title = allpapers[i]["metadata"]["title"]
        text_content = cotools.text(allpapers[i])
        # Fall back to the title when a paper has no body text.
        if len(text_content.strip()) == 0:
            text_content = paper_title
        writer.add_document(paperid=i, title=paper_title, content=text_content)
    print("\nDone. Committing Index")
    writer.commit()
    print("Done Indexing")
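# A minimal usage sketch for the index built by make_index, assuming it is a
# Whoosh index (as index.create_in suggests) and that Cord19Schema stores the
# "paperid" and "title" fields added above. search_index and the example query
# string are hypothetical, not part of the original module.
from whoosh import index
from whoosh.qparser import QueryParser

def search_index(paper_directory=".", query_string="coronavirus", limit=10):
    ix = index.open_dir(paper_directory)              # open the index written by make_index
    parser = QueryParser("content", schema=ix.schema)  # search the "content" field
    query = parser.parse(query_string)
    with ix.searcher() as searcher:
        for hit in searcher.search(query, limit=limit):
            # Each hit exposes the stored fields added in make_index.
            print(hit["paperid"], hit["title"])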
def make_coords(paper_directory=".", num_docs=None, write_df=False):
    '''
    Analyze the documents, creating a 2D layout for visualization. This
    function also optionally writes the DataFrame needed by the
    visualization.

    Parameters
    ----------
    paper_directory : str
        A path to a directory where the data has been downloaded.
    num_docs : int
        The number of documents to analyze. This is mostly for testing.
        Set this to a small number to analyze only the first num_docs
        documents.
    write_df : boolean
        Whether a pandas.DataFrame should be written in paper_directory
        that contains the titles and the coordinates. If this is False,
        you must write the DataFrame yourself.

    returns : the 2D coordinates as a numpy array.
    '''
    allpapers = cotools.Paperset(paper_directory)
    alltitles = []

    def each_paper_text(somepapers, range_min=0, range_max=None):
        # Yield "title\n\ntext" for each paper, collecting titles as a side effect.
        for i in range(range_min, len(somepapers) if range_max is None else range_max):
            alltitles.append(somepapers[i]["metadata"]["title"])
            yield (alltitles[-1] + "\n\n" + cotools.text(somepapers[i]))

    tfidf_vectorizer = TfidfVectorizer(min_df=20, stop_words='english')
    tfidf_vecs = tfidf_vectorizer.fit_transform(
        each_paper_text(allpapers, range_max=num_docs))
    tsne_vectorizer = TSNE()
    tsne_vecs = tsne_vectorizer.fit_transform(tfidf_vecs)
    if write_df:
        df = pd.DataFrame(tsne_vecs, columns=["X", "Y"])
        df["title"] = alltitles
        df.to_pickle(path.join(paper_directory, "metadata.df.pickle"))
    return tsne_vecs
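# A minimal plotting sketch for the coordinates written by make_coords, assuming
# matplotlib is available. plot_coords is a hypothetical helper; the file name
# "metadata.df.pickle" and the "X"/"Y"/"title" columns come from the code above
# (write_df=True).
from os import path
import pandas as pd
import matplotlib.pyplot as plt

def plot_coords(paper_directory="."):
    df = pd.read_pickle(path.join(paper_directory, "metadata.df.pickle"))
    plt.scatter(df["X"], df["Y"], s=2)   # one point per paper in the t-SNE layout
    plt.title("t-SNE layout of TF-IDF vectors")
    plt.show()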
import cotools
from pprint import pprint

#cotools.download(dir="data")

noncomm = cotools.Paperset("data/noncomm_use_subset")
data = cotools.Paperset("data/comm_use_subset")

pprint(data[0])
print(type(data[0]))

# get the text for one feature
cotools.text(data[0])
cotools.texts(data[:15])
data.text()
data.apply(len)

# dict
pprint(data[:2])
print(type(data[2:5]))  # list
print(len(data))

# takes about 5gb in memory
alldata = data[:]
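# A hedged follow-up sketch: data.apply(len) above suggests apply maps a
# function over each paper dict, so a custom function can compute a rough
# body-text word count per paper. word_count is a hypothetical helper, not
# part of cotools.
def word_count(paper):
    return len(cotools.text(paper).split())

word_counts = data.apply(word_count)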
import cotools as co
from pprint import pprint
import os
import sys

downloaded = True  # change me if you haven't downloaded the data
if not downloaded:
    co.download(dir='data', match="2020-04-10", regex=True)

pprint(os.listdir('data'))

data = co.Paperset('data/custom_license')
print(str(sys.getsizeof(data)) + ' bytes')
print(f"{len(data)} papers")
print()

print("What data[index] looks like:")
pprint(data[13])
print()

print("What the text looks like:")
pprint(co.text(data[13]))
print()

print("What the abstract looks like:")
try:
    pprint(co.abstract(data[13]))
except KeyError:
    print("Abstract Not Found")
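# A small follow-up sketch building on the KeyError pattern above: count how
# many papers expose an abstract. count_abstracts is a hypothetical helper;
# iterating over a Paperset is shown in the filtering example later in this
# section.
def count_abstracts(papers):
    have, missing = 0, 0
    for paper in papers:
        try:
            co.abstract(paper)   # raises KeyError when a paper has no abstract
            have += 1
        except KeyError:
            missing += 1
    return have, missing

# Example: have, missing = count_abstracts(data)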
if __name__ == '__main__':
    # Path to the CORD-19 dataset
    project_resources = Config.project_resources
    # Path where the annotated files will be saved
    path_output = Config.corpus_annotated

    pathlib.Path(os.path.dirname(project_resources)).mkdir(parents=True, exist_ok=True)
    pathlib.Path(os.path.dirname(path_output)).mkdir(parents=True, exist_ok=True)

    if Config.DOWNLOAD_CORPUS:
        cotools.download(dir=project_resources)

    wa = WrapperAnnotator()
    folders_corpus = ["pdf_json", "pmc_json"]

    for folder in folders_corpus:
        data = cotools.Paperset(project_resources + '/' + folder)

        # You may want to change the number of workers
        if Config.ENTITY_FISHING:
            with tqdm.tqdm(total=len(data)) as pbar:
                with concurrent.futures.ProcessPoolExecutor() as executor:
                    executor.map(func_entity_fishing, data)

        if Config.DBPEDIA_SPOTLIGHT:
            with tqdm.tqdm(total=len(data)) as pbar:
                with concurrent.futures.ProcessPoolExecutor() as executor:
                    executor.map(func_dbpedia_spotlight, data)

        if Config.NCBO_BIOPORTAL:
            with tqdm.tqdm(total=len(data)) as pbar:
                with concurrent.futures.ProcessPoolExecutor() as executor:
from pprint import pprint

import cotools

# cotools.download(dir="data")
# noncomm = cotools.Paperset("data/noncomm_use_subset")
data = cotools.Paperset("data/custom_license")
# pprint(data[0])
# print(type(data[0]))

# get the text for one feature
cotools.text(data[0])
cotools.texts(data[:15])

import pdb
pdb.set_trace()  # XXX BREAKPOINT

data.apply(len)

# dict
# pprint(data[:2])
print(type(data[2:5]))  # list
print(len(data))
# takes about 5gb in memory
# Third party modules
import cotools
from cotools import abstract, text

data = cotools.Paperset("data/all")

digest = [
    x for x in data
    if "digest" in cotools.text(x) or "digest" in cotools.abstract(x)
]

cov = ["covid", "novel_coronavirus"]

digest_covid = [
    x for x in digest
    if any(c in text(x).lower() for c in cov)
    or any(c in abstract(x).lower() for c in cov)
]

len(digest_covid)

for d in digest_covid:
    print("-" * 55)
    print("\r\n")
    print("NEW PAPER")
    print("\r\n")
    print(abstract(d))
    print(text(d))
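# A small follow-up sketch: collect just the titles of the matched papers
# instead of printing full text, assuming each paper dict carries the
# ["metadata"]["title"] field used by the indexing code earlier in this section.
digest_covid_titles = [d["metadata"]["title"] for d in digest_covid]
print(len(digest_covid_titles), "matching papers")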