Example #1
0
def computeOverlapForAllReferences(doc, inverted_index):
    """
        Compute the word overlap between this document's full text and the
        full text of every reference that can be matched in the corpus.

        Args:
            doc: SciDoc-like document; must support getFullDocumentText()
                and expose a "references" list
            inverted_index: passed straight through to computeWordOverlap()
        Returns:
            dict mapping reference id -> result of computeWordOverlap() for
            each reference successfully matched in the corpus; unmatched
            references are silently skipped
    """
    text1 = doc.getFullDocumentText(True, False)
    per_reference_overlap = {}
    for ref in doc["references"]:
        match = cp.Corpus.matcher.matchReference(ref)
        # References that cannot be matched in the corpus are skipped.
        if match:
            doc2 = cp.Corpus.loadSciDoc(match["guid"])
            text2 = doc2.getFullDocumentText(True, False)
            # NOTE: the original also tokenized text1/text2 here but never
            # used the tokens; computeWordOverlap() works on the raw text.
            per_reference_overlap[ref["id"]] = computeWordOverlap(
                text1, text2, inverted_index)

    return per_reference_overlap
Example #2
0
def computeOverlapForAllReferences(doc, inverted_index):
    """
        Compute the word overlap between this document's full text and the
        full text of every reference that can be matched in the corpus.

        Args:
            doc: SciDoc-like document; must support getFullDocumentText()
                and expose a "references" list
            inverted_index: passed straight through to computeWordOverlap()
        Returns:
            dict mapping reference id -> result of computeWordOverlap() for
            each reference successfully matched in the corpus; unmatched
            references are silently skipped
    """
    text1 = doc.getFullDocumentText(True, False)
    per_reference_overlap = {}
    for ref in doc["references"]:
        match = cp.Corpus.matcher.matchReference(ref)
        # References that cannot be matched in the corpus are skipped.
        if match:
            doc2 = cp.Corpus.loadSciDoc(match["guid"])
            text2 = doc2.getFullDocumentText(True, False)
            # NOTE: the original also tokenized text1/text2 here but never
            # used the tokens; computeWordOverlap() works on the raw text.
            per_reference_overlap[ref["id"]] = computeWordOverlap(
                text1, text2, inverted_index)

    return per_reference_overlap
Example #3
0
 def getDocumentTokens(self, doc):
     """
         Return a dict mapping each term in the document to its count.
     """
     # Lowercase before tokenizing so counting is case-insensitive.
     lowered = doc.getFullDocumentText(True, False).lower()
     # remove many stopwords. hack!
     kept = [tok for tok in tokenizeText(lowered)
             if tok not in local_stopwords_list]
     return getDictOfTokenCounts(kept)
 def getDocumentTokens(self, doc):
     """
         Returns a dict of all terms in the document, keyed by term with
         their occurrence counts as values.
     """
     raw_text = doc.getFullDocumentText(True, False)
     all_tokens = tokenizeText(raw_text.lower())
     filtered = []
     # Stopword filtering: drop anything in the local stopword list (hack!).
     for tok in all_tokens:
         if tok not in local_stopwords_list:
             filtered.append(tok)
     return getDictOfTokenCounts(filtered)
Example #5
0
def tokenizeAndTag(text, glob):
    """
        Function to be passed to SciDoc.prettyPrintHTML that will mark each
        token individually.

        Args:
            text: text to tokenize
            glob: shared globals dict; initializes/uses "_token_counter"
                (running int id) and "_inverted_index" (lowercased token ->
                set of token ids)
        Returns:
            the tokens re-assembled as space-joined HTML <span> elements,
            each carrying a unique sequential id
    """
    glob["_token_counter"] = glob.get("_token_counter", 0)
    # defaultdict(set) is the idiomatic equivalent of
    # defaultdict(lambda: set()).
    glob["_inverted_index"] = glob.get("_inverted_index", defaultdict(set))
    tokens = tokenizeText(text)
    res = []
    for token in tokens:
        token_low = token.lower()
        # Record this token's position (case-insensitive) in the index.
        glob["_inverted_index"][token_low].add(glob["_token_counter"])
        res.append('<span class="token" id="t' + str(glob["_token_counter"]) +
                   '">' + token + '</span>')
        glob["_token_counter"] += 1
    return " ".join(res)
Example #6
0
def tokenizeAndTag(text, glob):
    """
        Function to be passed to SciDoc.prettyPrintHTML that will mark each
        token individually.

        Args:
            text: text to tokenize
            glob: shared globals dict; initializes/uses "_token_counter"
                (running int id) and "_inverted_index" (lowercased token ->
                set of token ids)
        Returns:
            the tokens re-assembled as space-joined HTML <span> elements,
            each carrying a unique sequential id
    """
    glob["_token_counter"] = glob.get("_token_counter", 0)
    # defaultdict(set) is the idiomatic equivalent of
    # defaultdict(lambda: set()).
    glob["_inverted_index"] = glob.get("_inverted_index", defaultdict(set))
    tokens = tokenizeText(text)
    res = []
    for token in tokens:
        token_low = token.lower()
        # Record this token's position (case-insensitive) in the index.
        glob["_inverted_index"][token_low].add(glob["_token_counter"])
        res.append('<span class="token" id="t' + str(glob["_token_counter"]) +
                   '">' + token + '</span>')
        glob["_token_counter"] += 1
    return " ".join(res)
Example #7
0
    def textFormatting(self, text, glob):
        """
            Text formatting function to be passed to
            JsonDoc.prettyPrintDocumentHTML().

            Args:
                text: text of sentence
                glob: globals dictionary (unused here)
        """
        stripped = text.strip(".")
        spans = []
        for token in tokenizeText(stripped):
            # Unknown tokens fall back to token_id 0 with no references.
            info = self.term_info.get(token.lower(),
                                      {"token_id": 0, "references": []})
            token_id = str(info["token_id"])
            css_classes = token_id + " " + " ".join(info["references"])
            spans.append('<span term_id="%s" class="%s">%s</span>' %
                         (token_id, css_classes, token))
        return " ".join(spans).strip().strip(".")
    def textFormatting(self, text, glob):
        """
            Text formatting function to be passed to
            JsonDoc.prettyPrintDocumentHTML().

            Args:
                text: text of sentence
                glob: globals dictionary (unused here)
        """
        pieces = []
        trimmed = text.strip(".")
        for tok in tokenizeText(trimmed):
            # self.term_info lookup is case-insensitive; missing tokens get
            # a default record.
            token_dict = self.term_info.get(tok.lower(),
                                            {"token_id": 0, "references": []})
            refs_str = " ".join(token_dict["references"])
            tid = str(token_dict["token_id"])
            pieces.append('<span term_id="%s" class="%s">%s</span>' %
                          (tid, tid + " " + refs_str, tok))

        joined = " ".join(pieces).strip()
        return joined.strip(".")