Example #1
def make_index(paper_directory=".", num_docs=None):
    ''' Create a searchable index from the data set.

    Parameters
    ----------
    
    paper_directory : str
        A path to a directory where the data has been downloaded.

    num_docs : int, optional
        The number of documents to analyze. This is mostly for testing:
        set it to a small number to index only the first num_docs documents.

    Returns
    -------
    None
    '''
    allpapers = cotools.Paperset(paper_directory)
    ix = index.create_in(paper_directory, Cord19Schema)
    writer = ix.writer()
    for i in range(0, num_docs if num_docs is not None else len(allpapers)):
        if (i + 1) % 100 == 0:
            print(".", end='', flush=True)
        if (i + 1) % 1000 == 0:
            print()
        paper_title = allpapers[i]["metadata"]["title"]
        text_content = cotools.text(allpapers[i])
        if len(text_content.strip()) == 0:
            text_content = paper_title
        writer.add_document(paperid=i,
                            title=paper_title,
                            content=text_content)
    print("\nDone. Committing Index")
    writer.commit()
    print("Done Indexing")
Example #2
def func_ncbo(d):
    """
    Helper function for processing a paper in a thread with NCBO BioPortal Annotator+
    :param d: content of the paper
    :return: result of the annotation with NCBO BioPortal Annotator+ in JSON ||
     None if the JSON annotation exists already
    """
    d_json = {}
    paper_id = d['paper_id']
    title = d["metadata"]["title"]
    if os.path.isfile(path_output + '/ncbo/' + folder + '/' + paper_id +
                      '.json'):
        return None

    try:
        body_text = cotools.text(d)
        isreliable, textbytesfound, details, vectors = pycld2.detect(
            body_text, returnVectors=True)
        lang = vectors[0][3]
    # None or out of range
    except:
        lang = 'en'

    if os.path.isfile('/data/CORD19-Annotation-multi/entity-fishing/' +
                      folder + '/' + paper_id + '.json'):
        return None

    d_json["paper_id"] = paper_id
    d_json["lang"] = lang
    try:
        abstract = cotools.abstract(d)
        d_json["abstract"] = wa.request_ncbo_plus(abstract, lang)
    # no abstract
    except:
        pass

    body_text = cotools.text(d)

    d_json["paper_id"] = paper_id
    d_json["title"] = wa.request_ncbo_plus(title, lang)
    d_json["body_text"] = wa.request_ncbo_plus(body_text, lang)
    d_json["ref_entries"] = {}
    for key, value in d["ref_entries"].items():
        d_json["ref_entries"][key] = wa.request_ncbo_plus(value["text"], lang)

    d_json["back_matter"] = []
    for matter in d["back_matter"]:
        for key, value in matter.items():
            if key == 'text':
                text = {'text': wa.request_ncbo_plus(value, lang)}
                d_json["back_matter"].append(text)
    pbar.update()
    Output().save_json(
        d_json,
        path_output + '/ncbo/' + folder + '/' + d["paper_id"] + '.json')
    return d_json
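The helper above is meant to be mapped over many papers in parallel; pbar, wa, folder, path_output and Output are assumed to be defined in the surrounding module. A minimal driver sketch (the thread count and dataset path are illustrative, not from the original code):

from concurrent.futures import ThreadPoolExecutor

import cotools
from tqdm import tqdm

papers = cotools.Paperset('data/custom_license')
pbar = tqdm(total=len(papers))     # func_ncbo calls pbar.update() once per paper

with ThreadPoolExecutor(max_workers=8) as pool:
    annotations = [r for r in pool.map(func_ncbo, papers) if r is not None]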
Example #3
    def index(self, docs):
        """
        Index the documents using the scoring method
        Input: The Paperset lazy loader containing the documents
        """
        num_docs = len(docs)
        self.num_docs = num_docs
        for i in range(num_docs):
            doc = docs[i]
            # get the text
            doc_text = cotools.text(doc)
            tokens, tags = _get_doc_tokens_and_tags(doc_text)

            # update the number of times a token appears
            self.word_freq.update(tokens)

            # update the number of times a token appears in a document
            # use set to get unique tokens only
            self.doc_freq.update(set(tokens))

            if tags is not None:
                # update the list of tags
                self.tags.update(tags.split())

        self.total_tokens = sum(self.word_freq.values())

        # number of unique tokens
        num_tokens = len(self.word_freq.values())
        # avg frequency per token
        self.avg_freq = self.total_tokens / num_tokens

        # average doc length
        self.avg_doc_len = self.total_tokens / self.num_docs

        # compute scores using the scoring method
        for token, freq in self.doc_freq.items():
            #TODO: implement compute_score
            self.scores[token] = self.compute_score(freq)

        # average score across all tokens, used as a fallback in weights()
        self.avg_score = sum(self.scores.values()) / len(self.scores)

        # keep only tags that appear in roughly 1% of documents or more
        _filter_tags(.009)
Example #4
    def weights(self, doc):
        """
        Build the weight vector for the tokens in the document
        Input: a single paper dict (as returned by the Paperset loader)
        """

        weights = list()
        # get the text
        doc_text = cotools.text(doc)
        tokens, _ = _get_doc_tokens_and_tags(doc_text)

        num_tokens = len(tokens)

        for token in tokens:
            # get the frequency and score for the token
            freq = self.word_freq.get(token)
            score = self.scores.get(token)
            if not freq:
                freq = self.avg_freq
            if not score:
                score = self.avg_score

            token_weight = self._get_token_weight(freq, score, num_tokens)
            weights.append(token_weight)

        # boost weights of tag tokens to equal the largest weight in the list
        if self.tags:
            tags = {
                token: self.tags[token]
                for token in tokens if token in self.tags
            }
            if tags:
                max_weight = max(weights)
                max_tag = max(tags.values())
                weights = [
                    max(max_weight * (tags[tokens[x]] / max_tag), weight)
                    if tokens[x] in tags else weight
                    for x, weight in enumerate(weights)
                ]

        return weights
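These two methods assume a scorer object that keeps word_freq, doc_freq, scores, tags and the related statistics as attributes. A usage sketch, with TokenScorer as a purely illustrative name for that class:

import cotools

scorer = TokenScorer()                     # hypothetical class exposing index() and weights()
docs = cotools.Paperset("data/comm_use_subset")

scorer.index(docs)                         # build frequency tables and per-token scores
doc_weights = scorer.weights(docs[0])      # weight vector for the first paper
print(len(doc_weights), max(doc_weights))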
Example #5
import cotools
from pprint import pprint


#cotools.download(dir="data")

noncomm = cotools.Paperset("data/noncomm_use_subset")

data = cotools.Paperset("data/comm_use_subset")
pprint(data[0])
print(type(data[0]))

# get the text for one paper
cotools.text(data[0])

cotools.texts(data[:15])

data.text()

data.apply(len)

# dict

pprint(data[:2])
print(type(data[2:5]))
# list

print(len(data))

# takes about 5gb in memory
alldata = data[:]
Example #6
import os
import sys
from pprint import pprint

import cotools as co

co.download(dir='data', match="2020-04-10", regex=True)

pprint(os.listdir('data'))

data = co.Paperset('data/custom_license')
print(str(sys.getsizeof(data)) + ' bytes')

print(f"{len(data)} papers")

print()
print("How data[index] looks like:")
pprint(data[13])

print()
print("How text looks like")
pprint(co.text(data[13]))

print()
print("How abstract looks like")
try:
    pprint(co.abstract(data[13]))
except KeyError:
    print("Abstract Not Found")

#pprint(co.abstracts(data[14:18]))

#abstracts = data.abstracts()
#pprint(abstracts)

## finding abstracts
print()
Example #7
def each_paper_text(somepapers, range_min=0, range_max=None):
    # alltitles is assumed to be a list defined at module scope
    for i in range(range_min,
                   len(somepapers) if range_max is None else range_max):
        alltitles.append(somepapers[i]["metadata"]["title"])
        yield (alltitles[-1] + "\n\n" + cotools.text(somepapers[i]))
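The generator yields one "title + body text" string per paper, so it can feed any pipeline that consumes an iterable of strings; alltitles collects the titles as a side effect. A small usage sketch (the dataset path and count are illustrative):

import cotools

alltitles = []                             # filled as a side effect of the generator
papers = cotools.Paperset("data/comm_use_subset")

for doc_text in each_paper_text(papers, range_max=5):
    print(len(doc_text), "chars:", alltitles[-1][:60])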
Example #8
# Third party modules
import cotools
from cotools import abstract, text

data = cotools.Paperset("data/all")

digest = [
    x for x in data
    if "digest" in cotools.text(x) or "digest" in cotools.abstract(x)
]

cov = ["covid", "novel_coronavirus"]
digest_covid = [
    x for x in digest
    if any(c in text(x).lower() for c in cov) or any(c in abstract(x).lower()
                                                     for c in cov)
]

len(digest_covid)

for d in digest_covid:
    print("-" * 55)
    print("\r\n")
    print("NEW PAPER")
    print("\r\n")
    print(abstract(d))
    print(text(d))
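As Example #6 shows, abstract() raises KeyError when a paper has no abstract, so the filters above will stop on such papers. A defensive variant of the first filter, sketched with a hypothetical safe_abstract helper:

def safe_abstract(x):
    # fall back to an empty string for papers without an abstract
    try:
        return abstract(x)
    except KeyError:
        return ""

digest = [
    x for x in data
    if "digest" in text(x) or "digest" in safe_abstract(x)
]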