Code Example #1
File: dataprep.py  Project: sandialabs/galen-view
import cotools
from whoosh import index

# Cord19Schema (a whoosh schema) is defined elsewhere in this project.


def make_index(paper_directory=".", num_docs=None):
    ''' Create a searchable index from the data set.

    Parameters
    ----------
    
    paper_directory : str
        A path to a directory where the data has been downloaded.

    num_docs : int
        The number of documents to analyze.  This is mostly for testing.
        Set this to a small number to analyze only the first num_docs 
        documents.
    
    Returns
    -------
    None
    '''
    allpapers = cotools.Paperset(paper_directory)
    ix = index.create_in(paper_directory, Cord19Schema)
    writer = ix.writer()
    # print a progress dot every 100 papers and a newline every 1000
    for i in range(0, num_docs if num_docs is not None else len(allpapers)):
        if (i + 1) % 100 == 0:
            print(".", end='', flush=True)
        if (i + 1) % 1000 == 0:
            print()
        paper_title = allpapers[i]["metadata"]["title"]
        text_content = cotools.text(allpapers[i])
        if len(text_content.strip()) == 0:
            # fall back to the title when a paper has no body text
            text_content = paper_title
        writer.add_document(paperid=i,
                            title=paper_title,
                            content=text_content)
    print("\nDone. Committing Index")
    writer.commit()
    print("Done Indexing")
Code Example #2
File: dataprep.py  Project: sandialabs/galen-view
from os import path

import cotools
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE


def make_coords(paper_directory=".", num_docs=None, write_df=False):
    ''' Analyze the documents, creating a 2D layout for visualization.

    This function also optionally writes the DataFrame needed by the
    visualization.

    Parameters
    ----------

    paper_directory : str
        A path to a directory where the data has been downloaded.

    num_docs : int
        The number of documents to analyze.  This is mostly for testing.
        Set this to a small number to analyze only the first num_docs 
        documents.

    write_df : boolean
        Whether a pandas.DataFrame should be written in paper_directory
        that contains the titles and the coordinates.  If this is False,
        you must write the DataFrame yourself.  

    Returns
    -------
    numpy.ndarray
        The 2D coordinates computed by t-SNE, one row per document.
    '''
    allpapers = cotools.Paperset(paper_directory)
    alltitles = []

    def each_paper_text(somepapers, range_min=0, range_max=None):
        # yield "title\n\nbody" for each paper, collecting titles as a side effect
        for i in range(range_min,
                       len(somepapers) if range_max is None else range_max):
            alltitles.append(somepapers[i]["metadata"]["title"])
            yield (alltitles[-1] + "\n\n" + cotools.text(somepapers[i]))

    # TF-IDF over the paper texts; terms in fewer than 20 documents are dropped
    tfidf_vectorizer = TfidfVectorizer(min_df=20, stop_words='english')
    tfidf_vecs = tfidf_vectorizer.fit_transform(
        each_paper_text(allpapers, range_max=num_docs))
    # project the high-dimensional TF-IDF vectors down to two dimensions
    tsne_vectorizer = TSNE()
    tsne_vecs = tsne_vectorizer.fit_transform(tfidf_vecs)
    if write_df:
        df = pd.DataFrame(tsne_vecs, columns=["X", "Y"])
        df["title"] = alltitles
        df.to_pickle(path.join(paper_directory, "metadata.df.pickle"))
    return tsne_vecs
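A usage sketch: compute coordinates for a small sample and plot them. matplotlib is an assumption here; the project code above does not use it.

import matplotlib.pyplot as plt

coords = make_coords(paper_directory=".", num_docs=500, write_df=True)
plt.scatter(coords[:, 0], coords[:, 1], s=2)
plt.title("t-SNE layout of 500 CORD-19 papers")
plt.show()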
Code Example #3
import cotools
from pprint import pprint


#cotools.download(dir="data")

noncomm = cotools.Paperset("data/noncomm_use_subset")

data = cotools.Paperset("data/comm_use_subset")
pprint(data[0])
print(type(data[0]))

# get the text for one paper
cotools.text(data[0])

# get the texts for the first 15 papers
cotools.texts(data[:15])

data.text()

# apply a function to every paper
data.apply(len)

# indexing with a single integer returns a dict (see data[0] above)

pprint(data[:2])
print(type(data[2:5]))
# slicing returns a list

print(len(data))

# loading every paper takes about 5 GB of memory
alldata = data[:]
Code Example #4
import cotools as co
from pprint import pprint
import os
import sys

downloaded = True  # change me if you haven't downloaded the data

if not downloaded:
    co.download(dir='data', match="2020-04-10", regex=True)

pprint(os.listdir('data'))

data = co.Paperset('data/custom_license')
print(str(sys.getsizeof(data)) + ' bytes')

print(f"{len(data)} papers")

print()
print("How data[index] looks like:")
pprint(data[13])

print()
print("How text looks like")
pprint(co.text(data[13]))

print()
print("How abstract looks like")
try:
    pprint(co.abstract(data[13]))
except KeyError:
    print("Abstract Not Found")
Code Example #5

import concurrent.futures
import os
import pathlib

import cotools
import tqdm

# Config, WrapperAnnotator, and the func_* annotation callbacks are
# defined elsewhere in this project.

if __name__ == '__main__':
    # Path to the CORD-19 dataset
    project_resources = Config.project_resources
    # Path where the annotated files will be saved
    path_output = Config.corpus_annotated
    pathlib.Path(os.path.dirname(project_resources)).mkdir(parents=True, exist_ok=True)
    pathlib.Path(os.path.dirname(path_output)).mkdir(parents=True, exist_ok=True)
    if Config.DOWNLOAD_CORPUS:
        cotools.download(dir=project_resources)
    wa = WrapperAnnotator()
    folders_corpus = ["pdf_json", "pmc_json"]

    for folder in folders_corpus:
        data = cotools.Paperset(project_resources + '/' + folder)

        # You may want to change the number of workers
        if Config.ENTITY_FISHING:
            with tqdm.tqdm(total=len(data)) as pbar:
                with concurrent.futures.ProcessPoolExecutor() as executor:
                    executor.map(func_entity_fishing, data)

        if Config.DBPEDIA_SPOTLIGHT:
            with tqdm.tqdm(total=len(data)) as pbar:
                with concurrent.futures.ProcessPoolExecutor() as executor:
                    executor.map(func_dbpedia_spotlight, data)

        if Config.NCBO_BIOPORTAL:
            with tqdm.tqdm(total=len(data)) as pbar:
                with concurrent.futures.ProcessPoolExecutor() as executor:
                    # the snippet is truncated here; by analogy with the two
                    # blocks above, the call is presumably:
                    executor.map(func_ncbo_bioportal, data)
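Note that executor.map never touches the pbar created above, so the progress bars in this example do not actually advance. A minimal sketch of one way to wire tqdm to the executor's results, using only the same libraries (the work function and items list are stand-ins, not project code):

import concurrent.futures

import tqdm


def work(item):
    return item * item  # stand-in for an annotation call


if __name__ == '__main__':
    items = list(range(1000))
    with concurrent.futures.ProcessPoolExecutor() as executor:
        # consuming the map iterator inside tqdm advances the bar per result
        results = list(tqdm.tqdm(executor.map(work, items), total=len(items)))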
Code Example #6
from pprint import pprint

import cotools

# cotools.download(dir="data")

# noncomm = cotools.Paperset("data/noncomm_use_subset")

data = cotools.Paperset("data/custom_license")
# pprint(data[0])
# print(type(data[0]))

# get the text for one paper
cotools.text(data[0])

cotools.texts(data[:15])

# apply a function to every paper
data.apply(len)

# indexing with a single integer returns a dict

# pprint(data[:2])
print(type(data[2:5]))
# slicing returns a list

print(len(data))

# loading every paper takes about 5 GB of memory
alldata = data[:]
Code Example #7
File: counter.py  Project: MartinThoma/COVID_modeling
# Third party modules
import cotools
from cotools import abstract, text

data = cotools.Paperset("data/all")

digest = [
    x for x in data
    if "digest" in text(x) or "digest" in abstract(x)
]

cov = ["covid", "novel_coronavirus"]
digest_covid = [
    x for x in digest
    if any(c in text(x).lower() for c in cov) or any(c in abstract(x).lower()
                                                     for c in cov)
]

print(len(digest_covid))

for d in digest_covid:
    print("-" * 55)
    print("\r\n")
    print("NEW PAPER")
    print("\r\n")
    print(abstract(d))
    print(text(d))
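As Code Example #4 shows, abstract raises KeyError for papers without an abstract, which would abort the filters above. A small hypothetical helper (safe_abstract is not part of cotools) makes the filter robust:

def safe_abstract(paper):
    # return an empty string instead of raising when a paper has no abstract
    try:
        return abstract(paper)
    except KeyError:
        return ""


digest = [
    x for x in data
    if "digest" in text(x) or "digest" in safe_abstract(x)
]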