Ejemplo n.º 1
0
def getDocBOWannotatedSections(doc, parameters=None, doctext=None):
    """
        Build a BOW whose title, abstract and body text are kept as separate
        fields, so each can be a field for the document in the Lucene index.

        The first entry of doc.allsections is used as the abstract; the text
        of every remaining section is concatenated into "text".

        Args:
            doc: document providing doc["metadata"]["title"], doc.allsections
                and doc.getSectionText()
            parameters: not used by this function
            doctext: optional full document text, passed on to
                addDocBOWFullTextField
        Returns:
            {1: [res]} where res holds "title", "abstract", "text" and the
            "_full_text" field added by addDocBOWFullTextField
    """
    res={}
    res["title"]=doc["metadata"]["title"]
    res["abstract"]=""

    if len(doc.allsections) > 0:
        try:
            abstract=doc.getSectionText(doc.allsections[0],False)
            # Assign (the old code compared with `==` and discarded the
            # result), so the stored abstract is citation-free and
            # lower-cased like the "text" field below.
            res["abstract"]=removeCitations(abstract).lower()
        except Exception:
            # Best effort: keep indexing the document even if its abstract
            # cannot be extracted.
            res["abstract"]=u"<UNICODE ERROR>"

    # Each section is prefixed with a space, exactly as the incremental
    # concatenation did, so the resulting string is unchanged.
    section_texts=[]
    for section in doc.allsections[1:]:
        section_texts.append(" "+doc.getSectionText(section,False))
    paper_text="".join(section_texts)

    res["text"]=removeCitations(paper_text).lower()
    addDocBOWFullTextField(doc,res,doctext)
    return {1:[res]}
Ejemplo n.º 2
0
def processContext(match,doctext, wleft, wright):
    """
        Extract the text to the left and right of a match, already tokenized.

        Instead of processing the whole document, only a character window
        around the match is cut out. Its size is estimated from the requested
        number of words plus 15 words of slack, multiplied by
        ESTIMATED_AVERAGE_WORD_LENGTH.

        Args:
            match: object exposing start() and end() offsets into doctext
            doctext: text the match was found in
            wleft: number of words wanted to the left of the match
            wright: number of words wanted to the right of the match
        Returns:
            dict with "left" and "right" (results of removePunctuation over
            the tokenized windows) and the character offsets "left_start"
            and "right_end" that delimit the window
    """
    match_start=match.start()
    match_end=match.end()

    left_chars=(wleft+15)*ESTIMATED_AVERAGE_WORD_LENGTH
    right_chars=(wright+15)*ESTIMATED_AVERAGE_WORD_LENGTH

    # never reach back before the beginning of the text
    left_start=max(0,match_start-left_chars)
    right_end=match_end+right_chars

    left_tokens=removePunctuation(
        tokenizeText(removeCitations(doctext[left_start:match_start])))
    right_tokens=removePunctuation(
        tokenizeText(removeCitations(doctext[match_end:right_end])))

    return {
        "left":left_tokens,
        "right":right_tokens,
        "left_start":left_start,
        "right_end":right_end,
    }
Ejemplo n.º 3
0
def getDocBOWpassagesMulti(doc, parameters=[100], doctext=None):
    """
        Split the document text into fixed-size token passages, one list of
        passages per window size in parameters.

        For every window size two passes are made over the tokens: one
        starting at token 0 and one starting at half a window, so the
        passages of the second pass straddle the boundaries of the first.

        Args:
            doc: full SciDoc to get text for
            parameters: iterable of window sizes, in tokens. The default list
                is only iterated, never modified.
            doctext: optional pre-extracted document text; when empty or
                None it is fetched with doc.getFullDocumentText
        Returns:
             multiple BOWs in a dictionary where the keys are the parameters
             and each value is a list of {"text": passage} dicts
    """

    if not doctext:
        doctext=doc.getFullDocumentText(doc, headers=True)

    doctext=removeCitations(doctext).lower()
    tokens=tokenizeText(doctext)
    num_tokens=len(tokens)
    res={}

    for param in parameters:
        passages=[]

        # `//` keeps the half-window offset an integer no matter which
        # division semantics are in effect; `range` is available on both
        # Python 2 and Python 3.
        for offset in (0, param//2):
            for start in range(offset, num_tokens, param):
                passages.append({"text":unTokenize(tokens[start:start+param])})

        res[param]=passages

    return res
Ejemplo n.º 4
0
def addDocBOWFullTextField(doc,res_dict,doctext=None):
    """
        Store the document's full text in res_dict under "_full_text".

        Args:
            doc: document to fetch the text from when doctext is empty/None
            res_dict: dict that is modified in place
            doctext: optional pre-extracted document text
        Returns:
            None; the result is written into res_dict
    """
    full_text=doctext or doc.getFullDocumentText(doc)
    cleaned=removeCitations(full_text).lower()
    # tokenize and join back, so the stored text is what the tokenizer sees
    res_dict["_full_text"]=unTokenize(tokenizeText(cleaned))
Ejemplo n.º 5
0
def getDocBOWfull(doc, parameters=None, doctext=None):
    """
        Build a single BOW from the document's full text.

        Args:
            doc: full SciDoc to get text for
            parameters: not used by this function
            doctext: optional pre-extracted document text; when empty or
                None it is fetched with doc.getFullDocumentText
        Returns:
            {1: [{"text": ...}]} -- a dict keyed by parameter whose value is
            a list of BOWs, the shape shared by the getDocBOW* functions
    """
    full_text=doctext or doc.getFullDocumentText(doc)
    cleaned=removeCitations(full_text).lower()
    bow={"text":unTokenize(tokenizeText(cleaned))}
    # only one BOW is produced, filed under the fixed key 1
    return {1:[bow]}
Ejemplo n.º 6
0
def getDocBOWTitleAbstract(doc, parameters=None, doctext=None):
    """
        Get BOW for document made up of only title and abstract

        The first entry of doc.allsections is used as the abstract. The
        doctext argument is ignored: the text is always rebuilt from the
        title and that section.

        Args:
            doc: document providing doc["metadata"]["title"], doc.allsections
                and doc.getSectionText()
            parameters: not used by this function
            doctext: not used by this function
        Returns:
            {1: [{"text": ...}]}
    """
    doctext=doc["metadata"]["title"]+". "
    if len(doc.allsections) > 0:
        try:
            doctext+=" " + doc.getSectionText(doc.allsections[0],False)
        except Exception:
            # Best effort: keep the title even if the abstract cannot be
            # extracted. `except Exception` (rather than a bare except) lets
            # KeyboardInterrupt and SystemExit propagate.
            doctext+=u"<UNICODE ERROR>"
    doctext=removeCitations(doctext).lower()
    tokens=tokenizeText(doctext)
    return {1:[{"text":unTokenize(tokens)}]}