def loadContextLines(self, line, all_lines, index, paper_data):
    """
        Given a line with an annotated citation in the document, returns a fully
        extracted context for that citation.

        Args:
            line: dict{sentiment, text} -- the citing sentence
            all_lines: list of all such line dicts in the document
            index: position of `line` within all_lines
            paper_data: metadata dict for the cited paper; its "id" selects the
                citation regex from paper_citation_regex
        Returns:
            list of lines to add, with text preprocessed to remove citations,
            or None if the citation could not be found in the line
    """
    old_line = line["text"]
    line["text"] = re.sub(paper_citation_regex[paper_data["id"]], CIT_MARKER, line["text"])
    if old_line == line["text"]:
##        print(old_line, "\n", paper_data["id"], "\n", paper_citation_regex[paper_data["id"]])
##        raise ValueError("Couldn't substitute citation!")
        # no citation found, no reason to do anything
        return None

    # take up to 4 lines either side of the citing sentence (slice end is exclusive)
    c_from = max(0, index - 4)
    c_to = min(len(all_lines), index + 5)
    lines_to_add = deepcopy(all_lines[c_from:c_to])
    for context_line in lines_to_add:
        context_line["text"] = removeURLs(removeACLCitations(context_line["text"]))
    return lines_to_add
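
# Illustrative sketch (not part of the original code): the data shapes this method
# expects. `all_lines` is the document's sentence list, each entry a dict with at
# least a "text" field, and paper_data["id"] must be a key of the module-level
# paper_citation_regex mapping. A hypothetical call could look like:
#
#   all_lines = [{"sentiment": "", "text": "We build on the parser of Smith (2005)."}, ...]
#   context = self.loadContextLines(all_lines[10], all_lines, 10, {"id": "P05-1001"})
#
# which returns the surrounding window of lines with citations and URLs stripped,
# or None when the cited paper does not actually appear in the given line.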
def getOutlinkContextAtharAnnotated(context):
    """
        Returns a context as annotated: list of tokens
    """
    tokens = []
    for line in context["lines"]:
        sent = line["sentiment"]
        # keep only lines whose sentiment annotation marks them as part of the
        # citation context (any of the "p", "n", "o" or "c" labels)
        if sent and ("p" in sent or "n" in sent or "o" in sent or "c" in sent):
            clean_line = removeURLs(line["line"]).replace(CIT_MARKER, "")
            clean_line = removeACLCitations(clean_line)
            tokens.extend(tokenizeText(clean_line))

    tokens = [token for token in tokens if token not in punctuation]
    return tokens
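
# Minimal usage sketch (an assumption, not from the original module): builds a toy
# Athar-style context dict and extracts the tokens of its annotated lines. The toy
# sentiment labels and "line" fields only illustrate the structure the function
# above expects; real contexts come from the annotated corpus. The text reuses the
# module-level CIT_MARKER constant rather than assuming its literal value.
def _exampleAnnotatedContext():
    toy_context = {
        "lines": [
            {"sentiment": "p", "line": "This approach " + CIT_MARKER + " works remarkably well."},
            {"sentiment": "x", "line": "An unrelated sentence that gets filtered out."},
        ]
    }
    # returns the tokens of the first line only, with the citation marker
    # and punctuation removed
    return getOutlinkContextAtharAnnotated(toy_context)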
def getOutlinkContextAtharWindowOfWords(context, left, right):
    """
        Returns a window-of-words context: list of tokens
    """
    # join with a space so words at line boundaries don't get glued together
    context_text = " ".join([line["line"] for line in context["lines"]])
    # remove URLs in text (normally footnotes and conversion errors)
    context_text = removeURLs(context_text)
    context_text = removeACLCitations(context_text)

    tokens = tokenizeText(context_text)
    tokens = [token for token in tokens if token not in punctuation]

    for index, token in enumerate(tokens):
        if token == CIT_MARKER:
            res = []
            # `left` tokens before the citation marker, `right` tokens after it;
            # clamp the left edge so a negative index doesn't wrap around
            res.extend(tokens[max(0, index - left):index])
            res.extend(tokens[index + 1:index + right + 1])
            return res
    return None
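
# Minimal usage sketch (an assumption, not from the original module): extracts a
# window of `left` tokens before and `right` tokens after the citation marker from
# a toy context. As above, the context structure and text are only illustrative.
def _exampleWindowOfWords():
    toy_context = {
        "lines": [
            {"sentiment": "o", "line": "Prior work " + CIT_MARKER + " reports strong results on this task."},
        ]
    }
    # 3 tokens to the left of the citation marker, 4 to the right
    return getOutlinkContextAtharWindowOfWords(toy_context, 3, 4)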