def getDocBOWpassagesMulti(doc, parameters=None, doctext=None):
    """
        Get BOWs for a document as overlapping, fixed-size token passages.

        The text has its citations removed with removeCitations(), is
        lower-cased and then tokenized. For every window size in `parameters`
        two series of passages are produced: one starting at token 0 and one
        starting half a window in. Both advance a full window at a time, so
        the second series straddles the boundaries of the first.

        Args:
            doc: full SciDoc to get text for
            parameters: iterable of integer window sizes, in tokens.
                Defaults to [100] when None.
            doctext: text to use instead of the document's own. When empty or
                None it is fetched with doc.getFullDocumentText(doc, headers=True)
        Returns:
            multiple BOWs in a dictionary where the keys are the parameters;
            each value is a list of {"text": passage} dicts, with the passages
            of the first series before those of the half-offset series
        Raises:
            ValueError: if a window size is 0 (range() rejects a zero step)
    """
    # None instead of a list literal avoids sharing one mutable default
    # object between calls.
    if parameters is None:
        parameters = [100]

    if not doctext:
        doctext = doc.getFullDocumentText(doc, headers=True)

    tokens = tokenizeText(removeCitations(doctext).lower())
    num_tokens = len(tokens)

    res = {}
    for param in parameters:
        passages = []
        # `//` keeps the half-window offset an int on Python 3 as well as on
        # Python 2, and range() exists on both (xrange() is Python 2 only).
        for first_token in (0, param // 2):
            for i in range(first_token, num_tokens, param):
                passages.append({"text": unTokenize(tokens[i:i + param])})
        res[param] = passages

    return res
def addDocBOWFullTextField(doc,res_dict,doctext=None):
    """
        Store the cleaned text of the document in res_dict["_full_text"].

        The text has its citations removed with removeCitations(), is
        lower-cased, tokenized and joined back together with unTokenize().

        Args:
            doc: document whose getFullDocumentText() supplies the text when
                `doctext` is empty or None
            res_dict: dictionary that is modified in place
            doctext: text to use instead of the document's own
        Returns:
            None; the result is written into `res_dict`
    """
    source_text = doctext if doctext else doc.getFullDocumentText(doc)
    lowered = removeCitations(source_text).lower()
    res_dict["_full_text"] = unTokenize(tokenizeText(lowered))
def getDocBOWfull(doc, parameters=None, doctext=None):
    """
        Get a single BOW for the document built from its whole text.

        The text has its citations removed with removeCitations(), is
        lower-cased, tokenized and joined back together with unTokenize().

        Args:
            doc: full SciDoc to get text for
            parameters: not used by this function; accepted so the signature
                matches the other BOW functions
            doctext: text to use instead of doc.getFullDocumentText(doc)
        Returns:
            {1: [{"text": <processed text>}]}
    """
    raw_text = doctext or doc.getFullDocumentText(doc)
    cleaned = removeCitations(raw_text).lower()
    full_text = unTokenize(tokenizeText(cleaned))
    # all functions must take a list of parameters and return dict[parameter]=list of BOWs
    return {1: [{"text": full_text}]}
def getDocBOWTitleAbstract(doc, parameters=None, doctext=None):
    """
        Get BOW for document made up of only title and abstract

        The text is the title from doc["metadata"]["title"] followed by the
        text of the first entry in doc.allsections (presumably the abstract --
        TODO confirm). It then has its citations removed with
        removeCitations(), is lower-cased, tokenized and joined back together
        with unTokenize().

        Args:
            doc: document providing the metadata and sections
            parameters: not used by this function
            doctext: not used as input; any value passed in is ignored
        Returns:
            {1: [{"text": <processed text>}]}
    """
    doctext = doc["metadata"]["title"] + ". "

    if len(doc.allsections) > 0:
        try:
            doctext += " " + doc.getSectionText(doc.allsections[0], False)
        except Exception:
            # Best effort: a section whose text cannot be retrieved or
            # concatenated is replaced by a marker. Catching Exception rather
            # than using a bare except lets KeyboardInterrupt and SystemExit
            # propagate.
            doctext += u"<UNICODE ERROR>"

    doctext = removeCitations(doctext).lower()
    tokens = tokenizeText(doctext)
    return {1: [{"text": unTokenize(tokens)}]}