Code Example #1
File: athar_corpus.py Project: danieldmm/minerva
    # Requires at module level: import re; from copy import deepcopy.
    # paper_citation_regex, CIT_MARKER, removeURLs and removeACLCitations
    # are module-level names defined elsewhere in athar_corpus.py.
    def loadContextLines(self, line, all_lines, index, paper_data):
        """
            Given a line with an annotated citation in the document, returns
            the fully extracted context for that citation.

            Args:
                line: dict{sentiment, text} -- the annotated citation line
                all_lines: list of all line dicts in the document
                index: position of `line` within all_lines
                paper_data: metadata dict for the cited paper (needs "id")
            Returns:
                list of line dicts to add, with text preprocessed to remove
                citations and URLs, or None if no citation was found
        """
        old_line = line["text"]
        # Replace citations of this paper with the generic citation marker
        line["text"] = re.sub(paper_citation_regex[paper_data["id"]], CIT_MARKER, line["text"])

        if len(old_line) == len(line["text"]):
            # Nothing was substituted: no citation found, nothing to do
            return None

        # Take a window of up to 4 lines either side of the citation line.
        # The slice end is exclusive, so index+5 keeps line index+4, and
        # clamping to len(all_lines) avoids dropping the document's last line.
        c_from = max(0, index - 4)
        c_to = min(len(all_lines), index + 5)

        lines_to_add = deepcopy(all_lines[c_from:c_to])
        # Fresh loop variable so the `line` argument is not shadowed
        for context_line in lines_to_add:
            context_line["text"] = removeURLs(removeACLCitations(context_line["text"]))

        return lines_to_add
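
A minimal usage sketch, assuming a corpus-reader instance (here called reader, a hypothetical name) that defines this method along with the module's paper_citation_regex table; the sample lines and paper id are invented:

all_lines = [
    {"sentiment": "o", "text": "Prior work explored parsing models."},
    {"sentiment": "p", "text": "Smith (2005) improved on this approach."},
    {"sentiment": "o", "text": "We build on these results."},
]
paper_data = {"id": "smith2005"}  # hypothetical id, keyed into paper_citation_regex
# reader is a hypothetical instance of the class defining loadContextLines
context_lines = reader.loadContextLines(all_lines[1], all_lines, 1, paper_data)
# -> up to 9 cleaned line dicts around the citation, or None if the
#    regex for "smith2005" matched nothing in the line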
Code Example #2
File: athar_corpus.py Project: danieldmm/minerva
# Requires at module level: from string import punctuation, plus the
# CIT_MARKER, removeURLs, removeACLCitations and tokenizeText helpers
# defined elsewhere in athar_corpus.py.
def getOutlinkContextAtharAnnotated(context):
    """
        Returns the context as annotated: a flat list of tokens taken from
        the lines that carry one of the sentiment labels p/n/o/c
    """
    tokens = []
    for line in context["lines"]:
        sent = line["sentiment"]
        # Keep only lines annotated with at least one sentiment label
        if sent and any(label in sent for label in "pnoc"):
            clean_line = removeURLs(line["line"]).replace(CIT_MARKER, "")
            clean_line = removeACLCitations(clean_line)
            tokens.extend(tokenizeText(clean_line))
    # Drop single-character punctuation tokens
    tokens = [token for token in tokens if token not in punctuation]
    return tokens
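
A hedged example of the input shape this function expects; the dict keys follow the code above, the text and labels are invented, and the exact tokens returned depend on tokenizeText:

context = {
    "lines": [
        {"sentiment": "p", "line": "This method " + CIT_MARKER + " works well."},
        {"sentiment": None, "line": "An unannotated line, which is skipped."},
    ]
}
tokens = getOutlinkContextAtharAnnotated(context)
# -> e.g. ["This", "method", "works", "well"] once the marker and
#    punctuation have been stripped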
Code Example #3
File: athar_corpus.py Project: danieldmm/minerva
def getOutlinkContextAtharWindowOfWords(context, left, right):
    """
        Returns a window-of-words context: the `left` tokens before and the
        `right` tokens after the citation marker, as a list of tokens
    """
    # Join with a space so that words at line boundaries are not fused
    context_text = " ".join([line["line"] for line in context["lines"]])
    # Remove URLs in the text (normally footnotes and conversion errors)
    context_text = removeURLs(context_text)
    context_text = removeACLCitations(context_text)
    tokens = tokenizeText(context_text)
    tokens = [token for token in tokens if token not in punctuation]
    for index, token in enumerate(tokens):
        if token == CIT_MARKER:
            res = []
            # Clamp the left bound: a negative slice start would wrap around
            res.extend(tokens[max(0, index - left):index])
            res.extend(tokens[index + 1:index + right + 1])
            return res
    # No citation marker in this context
    return None
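
A sketch of a call with a symmetric two-token window; the context dict is invented and the exact output depends on how tokenizeText splits the line:

context = {
    "lines": [
        {"sentiment": "o", "line": "As shown by " + CIT_MARKER + " this helps."},
    ]
}
window = getOutlinkContextAtharWindowOfWords(context, left=2, right=2)
# -> e.g. ["shown", "by", "this", "helps"]; returns None when no
#    citation marker is present in the context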