Code example #2
File: references.py  Project: Big-Data/pdfssa4met
def pdf2refs(opts, args):
    """."""
    global url_re
    xmltag = True
    highlight = False
    for o, a in opts:
        if (o == '--noxml'):
            xmltag = False
        elif (o == '--highlight'):
            highlight = True
    tree = pdf2etree(args)
    pubs = []
    urls = []
    try:
        xps = tree.xpath('//BLOCK')
    except AttributeError:
        return tree
    hit_ref = 0
    refs = []
    for el in xps:
        origtxts = []
        for el2 in el.iter():
            try: origtxts.append(el2.text.strip())
            except AttributeError: pass
            if el2 != el and el2.tail is not None:
                origtxts.append(el2.tail.strip())
        origtxt = ' '.join(origtxts)
        if not len(origtxt):
            continue
        elif origtxt.strip().startswith(('Reference', 'REFERENCE')) or origtxt.find('Reference') > 0 or origtxt[:20].find('REFERENCE') > 0:
            hit_ref = 1
            continue
        elif hit_ref:
            refs.append(origtxt)

    for ref in split_refs('\n'.join(refs)):
        for url in url_re.findall(ref):
            urls.append(url[0])

        pubbits = []
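        # note: 'el' here is left over from the BLOCK loop above, so this
        # xpath runs against the last block rather than the current reference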
        for pubnode in el.xpath(".//TOKEN[@italic='yes']"):
            pubtxt = etree.tostring(pubnode, method='text', encoding="UTF-8")
            pubbits.append(pubtxt)

        if len(pubbits): pubs.append(' '.join(pubbits))
        if xmltag:
            ref = tag_ref(ref, highlight)
        sys.stdout.write(ref + '\n')
        sys.stdout.flush()

    if len(pubs):
        sys.stdout.write('-'*10 + "\nCited Publications\n" + '-'*10 + '\n')
        for pub in pubs:
            sys.stdout.write(pub + '\n')
            sys.stdout.flush()
    if len(urls):
        sys.stdout.write('-'*10 + "\nCited URLs\n" + '-'*10 + '\n')
        for url in urls:
            sys.stdout.write(url + '\n')
            sys.stdout.flush()
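Usage note: pdf2refs expects a getopt-style pair, where opts is a list of (flag, value) tuples and args is handed straight to pdf2etree, so args presumably holds the PDF path. A minimal driver sketch, assuming references.py is importable and exposes pdf2refs as shown above (the script name and PDF path below are made up for illustration):

import getopt
import sys

from references import pdf2refs  # assumed import path

if __name__ == '__main__':
    # e.g. python run_refs.py --noxml --highlight paper.pdf
    opts, args = getopt.getopt(sys.argv[1:], "", ["noxml", "highlight"])
    pdf2refs(opts, args)  # args holds the remaining arguments, i.e. the PDF path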
Code example #3
File: socialtags.py  Project: Big-Data/pdfssa4met
def opencalaistags(opts, args):
    global api_key
    tree = pdf2etree(args)
    # could do something more sophisticated, but for now use full text
    full_text = ' '.join([etree.tostring(el, method="text", encoding="UTF-8") for el in tree.xpath('//TOKEN')])
    oc = OpenCalaisService("http://api1.opencalais.com/enlighten/rest/", api_key, "PDF SSA4MET Open Calais Tagger")
    ft_graph = oc.rdfFromText(full_text)

#    for t in oc.entitiesFromRdf(ft_graph):
#        print str(t)

#    print '-'*10,"\nEntities from Open Calais\n",'-'*10
#    for t in oc.entitiesFromRdf(ft_graph):
#        print str(t)

#    print '-'*10,"\nSocial Tags from Open Calais\n",'-'*10
    for tn, uri in oc.tagsFromRdf(ft_graph):
        sys.stdout.writelines([str(tn).ljust(35), str(uri), '\n'])
        sys.stdout.flush()
    return 0
Code example #5
File: headings.py  Project: Astromis/pdfssa4met
def pdf2heads(opts, args):
    xmltag = True
    highlight = False
    titleonly = False
    authonly = False
    for o, a in opts:
        if (o == '--noxml'):
            xmltag = False
        elif (o == '--highlight'):
            highlight = True
        if (o == '--title'):
            titleonly = True
        elif (o == '--author'):
            authonly = True

    tree = pdf2etree(args)

    # find title
    page = 1
    block = 1
    title_node = None
    while True:
        try:
            title_node = tree.xpath("//PAGE[{0}]//BLOCK[{1}]".format(
                page, block))[0]
        except IndexError:
            page += 1
        else:
            break
        if page > 2:
            # probably not going to find it now
            break

    # find author
    page = 1
    block = 2
    auth_node = None
    while True:
        try:
            auth_node = tree.xpath("//PAGE[{0}]//BLOCK[{1}]".format(
                page, block))[0]
        except IndexError:
            block += 1
        else:
            break
        if block > 4:
            # probably not going to find it now
            break

    font_sizes = tree.xpath('//TOKEN/@font-size')
    mean_font_size = mean(font_sizes)
    median_font_size = median(font_sizes)

    #print "Median Font Size (i.e. body text):", median_font_size

    font_colors = tree.xpath('//TOKEN/@font-color')
    font_color_hash = {}
    for fc in font_colors:
        try:
            font_color_hash[fc] += 1
        except KeyError:
            font_color_hash[fc] = 1

    sortlist = [(v, k) for k, v in font_color_hash.iteritems()]
    sortlist.sort(reverse=True)
    main_font_color = sortlist[0][1]
    head_txts = []
    stop = False
    for page_node in tree.xpath('//PAGE'):
        for block_node in page_node.xpath('.//BLOCK'):
            if xmltag:
                if block_node == title_node:
                    st = "<title>"
                    et = "</title>"
                elif block_node == auth_node:
                    st = "<author>"
                    et = "</author>"
                else:
                    st = "<heading>"
                    et = "</heading>"

                if highlight:
                    st = "\033[0;32m{0}\033[0m".format(st)
                    et = "\033[0;32m{0}\033[0m".format(et)
            else:
                st = et = ""
            if block_node == title_node and authonly:
                continue
            headers = block_node.xpath(
                ".//TOKEN[@font-size > {0} or @bold = 'yes' or @font-color != '{1}']"
                .format(mean_font_size * 1.05, main_font_color))
            head_txt = ' '.join([
                etree.tostring(el, method='text', encoding="UTF-8")
                for el in headers
            ])
            if len(head_txt):
                head_txts.append("{0}{1}{2}".format(st, head_txt, et))

            if block_node == title_node and titleonly:
                stop = True
                break
            elif block_node == auth_node and authonly:
                stop = True
                break
        if stop:
            break
    for txt in head_txts:
        sys.stdout.writelines([txt, '\n'])
Code example #6
def pdf2heads(opts, args):
    xmltag = True
    highlight = False
    titleonly = False
    authonly = False
    for o, a in opts:
        if (o == '--noxml'):
            xmltag = False
        elif (o == '--highlight'):
            highlight = True
        if (o == '--title'):
            titleonly = True
        elif (o == '--author'):
            authonly = True

    tree = pdf2etree(args)

    # find title
    page = 1
    block = 1
    title_node = None
    while True:
        try: title_node = tree.xpath("//PAGE[{0}]//BLOCK[{1}]".format(page, block))[0]
        except IndexError: page+=1
        else: break
        if page > 2:
            # probably not going to find it now
            break
        
    # find author
    page = 1
    block = 2
    auth_node = None
    while True:
        try: auth_node  = tree.xpath("//PAGE[{0}]//BLOCK[{1}]".format(page, block))[0]
        except IndexError: block+=1
        else: break
        if block > 4:
            # probably not going to find it now
            break
    
    font_sizes = tree.xpath('//TOKEN/@font-size')
    mean_font_size =  mean(font_sizes)
    median_font_size = median(font_sizes)

    #print "Median Font Size (i.e. body text):", median_font_size

    font_colors = tree.xpath('//TOKEN/@font-color')
    font_color_hash = {}
    for fc in font_colors:
        try:
            font_color_hash[fc]+=1
        except KeyError:
            font_color_hash[fc] = 1

    sortlist = [(v,k) for k,v in font_color_hash.iteritems()]
    sortlist.sort(reverse=True)
    main_font_color = sortlist[0][1]
    head_txts = []
    stop = False
    for page_node in tree.xpath('//PAGE'):
        for block_node in page_node.xpath('.//BLOCK'):
            if xmltag:
                if block_node == title_node:
                    st = "<title>"
                    et = "</title>"
                elif block_node == auth_node:
                    st = "<author>"
                    et = "</author>"
                else:
                    st = "<heading>"
                    et = "</heading>"
                    
                if highlight:
                    st = "\033[0;32m{0}\033[0m".format(st)
                    et = "\033[0;32m{0}\033[0m".format(et)
            else:
                st = et = ""
            if block_node == title_node and authonly:
                continue
            headers = block_node.xpath(".//TOKEN[@font-size > {0} or @bold = 'yes' or @font-color != '{1}']".format(mean_font_size*1.05, main_font_color))
            head_txt = ' '.join([etree.tostring(el, method='text', encoding="UTF-8") for el in headers])
            if len(head_txt):
                head_txts.append("{0}{1}{2}".format(st, head_txt, et))
                
            if block_node == title_node and titleonly:
                stop = True
                break
            elif block_node == auth_node and authonly:
                stop = True
                break
        if stop:
            break
    for txt in head_txts:
        sys.stdout.writelines([txt, '\n'])
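Note: the heading examples above call mean() and median() on the string values returned by the //TOKEN/@font-size query; those helpers are defined elsewhere in the project and are not shown in these listings. A rough sketch of what they might look like, assuming they coerce the attribute strings to floats (the names match the calls above, the bodies are a guess):

def mean(values):
    # assumed helper: average of font sizes supplied as strings, e.g. ['9.98', '12.0']
    values = [float(v) for v in values]
    return sum(values) / len(values) if values else 0.0


def median(values):
    # assumed helper: middle value; average of the two middle values for even counts
    values = sorted(float(v) for v in values)
    if not values:
        return 0.0
    mid = len(values) // 2
    return values[mid] if len(values) % 2 else (values[mid - 1] + values[mid]) / 2.0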
Code example #7
def pdf2heads(opts, args):
    xmltag = True
    highlight = False
    titleonly = False
    authonly = False
    for o, a in opts:
        if (o == '--noxml'):
            xmltag = False
        elif (o == '--highlight'):
            highlight = True
        if (o == '--title'):
            titleonly = True
        elif (o == '--author'):
            authonly = True

    tree = pdf2etree(args)

    # find title
    page = 1
    block = 1
    title_node = None
    while True:
        try: title_node = tree.xpath("//PAGE[{0}]//BLOCK[{1}]".format(page, block))[0]
        except IndexError: page+=1
        else: break
        if page > 2:
            # probably not going to find it now
            break

    # find author
    page = 1
    block = 2
    auth_node = None
    while True:
        try: auth_node  = tree.xpath("//PAGE[{0}]//BLOCK[{1}]".format(page, block))[0]
        except IndexError: block+=1
        else: break
        if block > 4:
            # probably not going to find it now
            break

    font_sizes = tree.xpath('//TOKEN/@font-size')
    mean_font_size =  mean(font_sizes)
    median_font_size = median(font_sizes)

    #print "Median Font Size (i.e. body text):", median_font_size

    font_colors = tree.xpath('//TOKEN/@font-color')
    font_color_hash = {}
    for fc in font_colors:
        try:
            font_color_hash[fc]+=1
        except KeyError:
            font_color_hash[fc] = 1

    sortlist = [(v,k) for k,v in font_color_hash.iteritems()]
    sortlist.sort(reverse=True)
    main_font_color = sortlist[0][1]
    head_txts = []
    stop = False
    for page_node in tree.xpath('//PAGE'):
        for block_node in page_node.xpath('.//BLOCK'):
            if xmltag:
                if block_node == title_node:
                    st = "<title>"
                    et = "</title>"
                elif block_node == auth_node:
                    st = "<author>"
                    et = "</author>"
                else:
                    st = "<heading>"
                    et = "</heading>"

                if highlight:
                    st = "\033[0;32m{0}\033[0m".format(st)
                    et = "\033[0;32m{0}\033[0m".format(et)
            else:
                st = et = ""
            if block_node == title_node and authonly:
                continue
            headers = block_node.xpath(".//TOKEN[@font-size > {0} or @bold = 'yes' or @font-color != '{1}']".format(mean_font_size*1.05, main_font_color))
            head_txt = ' '.join([etree.tostring(el, method='text', encoding="UTF-8") for el in headers])
            if len(head_txt):
                head_txts.append("{0}{1}{2}".format(st, head_txt, et))

            if block_node == title_node and titleonly:
                stop = True
                break
            elif block_node == auth_node and authonly:
                stop = True
                break
        if stop:
            break
    for txt in head_txts:
        sys.stdout.writelines([txt, '\n'])

def main(argv=None):
    if argv is None:
        argv = sys.argv[1:]
    try:
        try:
            opts, args = getopt.getopt(argv, "ht", ["help", "test", "noxml", "highlight", "title", "author"])
        except getopt.error as msg:
            raise UsageError(msg)
        for o, a in opts:
            if (o in ['-h', '--help']):
                # print help and exit
                sys.stdout.write(__doc__)
                sys.stdout.flush()
                return 0

        pdf2heads(opts, args)

    except UsageError as err:
        print >>sys.stderr, err.msg
        print >>sys.stderr, "for help use --help"
        return 2
    except ConfigError as err:
        sys.stderr.writelines([str(err.msg), '\n'])
        sys.stderr.flush()
        return 1
Code example #8
def pdf2heads(opts, args):
    global Verbose_flag
    xmltag = True
    highlight = False
    titleonly = False
    authonly = False
    Verbose_flag = False
    look_for_all_caps_headings = False
    global automatic_rerunning
    global Found_abstract
    global Found_Sammanfattning

    start_to_exclude = False

    for o, a in opts:
        if (o == '--noxml'):
            xmltag = False
        elif (o == '--highlight'):
            highlight = True
        if (o == '--title'):
            titleonly = True
        elif (o == '--author'):
            authonly = True
        elif (o == '--verbose'):
            Verbose_flag = True
            print "Verbose_flag is on"
        elif (o == '--caps'):
            print "looking for ABSTRACT and other headers in all caps"
            look_for_all_caps_headings = True

    if automatic_rerunning:
        print "looking for ABSTRACT and other headers in all caps"
        look_for_all_caps_headings = True

    tree = pdf2etree(args)

    # find title - look on the first page of the document at the first block of text on the page
    page = 1
    block = 1
    title_node = None
    while True:
        try:
            trial_title_node = tree.xpath("//PAGE[{0}]//BLOCK[{1}]".format(
                page, block))[0]
            if Verbose_flag:
                print "trial_title_node:"
                print trial_title_node

#            title_headers = trial_title_node.xpath(".//TOKEN[@font-size > {0}]".format(23))
# note that the Title is assumed to be 20 points or larger in size
            title_headers = trial_title_node.xpath(
                ".//TOKEN[@font-size > {0}]".format(20))
            if Verbose_flag:
                print "title_headers:"
                print title_headers
            title_head_txt = ' '.join([
                etree.tostring(el, method='text', encoding="UTF-8")
                for el in title_headers
            ])
            if len(title_head_txt):
                print "<Title>" + title_head_txt + "</Title>"
                title_node = trial_title_node
                next_block = block + 1
                break
        except IndexError:
            page += 1
        else:
            break
        if page > 2:
            # probably not going to find it now
            break

    # find subtitle - note that a subtitle is optional - start on the 2nd page and the second block on the page
    page = 2
    block = 2
    next_block = 2
    subtitle_node = None
    while True:
        try:
            trial_subtitle_node = tree.xpath("//PAGE[{0}]//BLOCK[{1}]".format(
                page, block))[0]
            if Verbose_flag:
                print "trial_subtitle_node:"
                print trial_subtitle_node

# the Subtitle is assumed to be larger than 19 points
            subtitle_headers = trial_subtitle_node.xpath(
                ".//TOKEN[@font-size > {0}]".format(19))
            if Verbose_flag:
                print "subtitle_headers:"
                print subtitle_headers

            if len(subtitle_headers) == 0:
                next_block = 2
                break
            subtitle_head_txt = ' '.join([
                etree.tostring(el, method='text', encoding="UTF-8")
                for el in subtitle_headers
            ])
            if len(subtitle_head_txt):
                subtitle_node = trial_subtitle_node
                print "<Subtitle>" + subtitle_head_txt + "</Subtitle>"
                next_block = 3
                break

        except IndexError:
            block += 1
        else:
            break
        if block > 4:
            # probably not going to find it now
            break

    # find author - on inside cover
    page = 2
    block = next_block
    auth_node = None
    while True:
        try:
            trial_auth_node = tree.xpath("//PAGE[{0}]//BLOCK[{1}]".format(
                page, block))[0]
            if Verbose_flag:
                print "trial_auth_node:"
                print trial_auth_node

# the author's name(s) is(are) assumed to be 15 points or larger in size
            auth_headers = trial_auth_node.xpath(
                ".//TOKEN[@font-size > {0}]".format(15))
            if Verbose_flag:
                print "auth_headers:"
                print auth_headers
            auth_head_txt = ' '.join([
                etree.tostring(el, method='text', encoding="UTF-8")
                for el in auth_headers
            ])
            if len(auth_head_txt):
                auth_node = trial_auth_node
                break

        except IndexError:
            block += 1
        else:
            break
        if block > 4:
            # probably not going to find it now
            break

    font_sizes = tree.xpath('//TOKEN/@font-size')
    mean_font_size = mean(font_sizes)
    median_font_size = median(font_sizes)

    #    print "Median Font Size (i.e. body text):", median_font_size

    font_colors = tree.xpath('//TOKEN/@font-color')
    font_color_hash = {}
    for fc in font_colors:
        try:
            font_color_hash[fc] += 1
        except KeyError:
            font_color_hash[fc] = 1

    sortlist = [(v, k) for k, v in font_color_hash.iteritems()]
    sortlist.sort(reverse=True)
    main_font_color = sortlist[0][1]
    head_txts = []
    stop = False

    page = 0
    Found_abstract = False
    Found_Sammanfattning = False

    for page_node in tree.xpath('//PAGE'):
        page = page + 1
        block_number = 0
        for block_node in page_node.xpath('.//BLOCK'):
            block_number = block_number + 1
            if xmltag:
                if block_node == title_node:
                    st = "<title>"
                    et = "</title>"
                if block_node == subtitle_node:
                    st = "<subtitle>"
                    et = "</subtitle>"
                elif block_node == auth_node:
                    st = "<author>"
                    et = "</author>"
                else:
                    st = "<heading>"
                    et = "</heading>"

                if highlight:
                    st = "\033[0;32m{0}\033[0m".format(st)
                    et = "\033[0;32m{0}\033[0m".format(et)
            else:
                st = et = ""
            if block_node == title_node and authonly:
                continue
# note that the assumption that the Abstract heading is set in a larger font than the median font size used on a page will not find
# abstracts of Aalto university - they set the word ABSTRACT in only a slightly larger font than the one used for the rest of the text, but they do set it in all CAPS
            if look_for_all_caps_headings:
                headers = block_node.xpath(
                    ".//TOKEN[@font-size > {0} or @bold = 'yes' or @font-color != '{1}']"
                    .format(mean_font_size, main_font_color))
            else:
                headers = block_node.xpath(
                    ".//TOKEN[@font-size > {0} or @bold = 'yes' or @font-color != '{1}']"
                    .format(mean_font_size * 1.05, main_font_color))

            head_txt = ' '.join([
                etree.tostring(el, method='text', encoding="UTF-8")
                for el in headers
            ])
            if head_txt in text_start_to_exclude:
                start_to_exclude = True
            head_txt = filter_headings(head_txt)
            if len(head_txt) and (not start_to_exclude):
                head_txts.append("{0}{1}{2}".format(st, head_txt, et))

            if head_txt.find("Abstract") >= 0 or head_txt.find(
                    "ABSTRACT") >= 0:
                if not Found_abstract:
                    print "Abstract (en):"
                    output_blocks_on_page(page_node, block_number, page)
                    Found_abstract = True
                break

            if head_txt.find("Sammanfattning") >= 0 or head_txt.find(
                    "SAMMANFATTNING") >= 0:
                if not Found_Sammanfattning:
                    print "Sammanfattning (sv):"
                    output_blocks_on_page(page_node, block_number, page)
                    Found_Sammanfattning = True
                break

            if head_txt.find("Abstrakt") >= 0 or head_txt.find(
                    "ABSTRAKT") >= 0:
                if not Found_Sammanfattning:
                    print "Abstrakt (sv):"
                    output_blocks_on_page(page_node, block_number, page)
                    Found_Sammanfattning = True
                break

            if head_txt.find("Referat") >= 0 or head_txt.find("REFERAT") >= 0:
                if not Found_Sammanfattning:
                    print "Referat (sv):"
                    output_blocks_on_page(page_node, block_number, page)
                    Found_Sammanfattning = True
                break


#
#            if head_txt.find("Abstracto(sp)") >= 0:
#                    print "Abstracto (sp):"
#                    output_blocks_on_page(page_node, block_number, page)
#                break
#
#            if head_txt.find("Abstrait (fr)") >= 0:
#                    print "Abstrait (fr):"
#                    output_blocks_on_page(page_node, block_number, page)
#                break

            if block_node == title_node and titleonly:
                stop = True
                break
            elif block_node == auth_node and authonly:
                stop = True
                break
        if stop:
            break
    for txt in head_txts:
        sys.stdout.writelines([txt, '\n'])
Code example #9
def pdf2heads(opts, args, document):
    global Verbose_flag
    global test_flag
    xmltag = True
    highlight = False
    titleonly = False
    authonly = False
    Verbose_flag = False
    test_flag = False
    global look_for_all_caps_headings
    look_for_all_caps_headings = False
    global automatic_rerunning
    global Found_Heading
    global Found_abstract
    global Found_org
    global Found_key
    global Found_Author
    global Found_Level
    global Found_Sammanfattning
    global Found_Method
    global Found_Introduction
    global Found_TOC
    global abstractOut_path
    global OrgandSup_path
    global referat_path
    global methodOut_path
    global introductionOut_path
    global toc_path
    global heading_path
    global title_path
    global author_path
    global subtitle_path
    global end_tag
    global tree
    global mean_font_size
    global main_font_color
    global document_type
    global mean_font_size
    global author
    author = ""

    document_type = document

    start_to_exclude = False

    for o, a in opts:
        if (o == '--noxml'):
            xmltag = False
        elif (o == '--highlight'):
            highlight = True
        if (o == '--title'):
            titleonly = True
        elif (o == '--author'):
            authonly = True
        elif (o == '--unittest'):
            test_flag = True
        elif (o == '--verbose'):
            Verbose_flag = True
            print "Verbose_flag is on"
        elif (o == '--caps'):
            print "looking for ABSTRACT and other headers in all caps"
            look_for_all_caps_headings = True

    if automatic_rerunning:
        print "looking for ABSTRACT and other headers in all caps"
        look_for_all_caps_headings = True

    tree = pdf2etree(args)
    global title_head_txt

    # find title - look on the first page of the document at the first block of text on the page
    page = 1
    block = 1
    title_node = None
    while (page < 2):
        try:
            trial_title_node = tree.xpath("//PAGE[{0}]//BLOCK[{1}]".format(
                page, block))[0]

            if Verbose_flag:  # verbose flag
                print "trial_title_node:"
                print trial_title_node

#            title_headers = trial_title_node.xpath(".//TOKEN[@font-size > {0}]".format(23))
# note that the Title is assumed to be 20 points or larger in size
            title_headers = trial_title_node.xpath(
                ".//TOKEN[(@font-size > {0} and @bold = 'yes') or (@font-size > {1} and @bold = 'yes')]"
                .format(20, 15))

            if Verbose_flag:  # verbose flag
                print "title_headers:"
                print title_headers

            title_head_txt = ' '.join([
                etree.tostring(el, method='text', encoding="UTF-8")
                for el in title_headers
            ])

            if len(title_head_txt):  # success: title found
                print "Title: found"
                title_path = '../../../../output/parse_result/' + directiory + '/title.txt'
                txt = title_head_txt
                st = 'title'
                json_append(st, txt)
                # with open(title_path, 'w') as f:
                #     print txt+ "\n"  # print tag information to certain file
                #     print >> f, txt, "\n"  # print tag information to certain file
                title_node = trial_title_node
                next_block = block + 1
                break
            block = block + 1
        except IndexError:
            page += 1

    # find subtitle - note that a subtitle is optional - start on the 2nd page and second block on the page
    # WRONG: SECOND PAGE IS TABLE OF CONTENT.
    page = 1
    block = next_block
    print_log("next block is:  " + str(block))
    subtitle_node = None
    while (page < 2):
        try:
            trial_subtitle_node = tree.xpath("//PAGE[{0}]//BLOCK[{1}]".format(
                page, block))[0]

            if Verbose_flag:
                print "trial_subtitle_node:"
                print trial_subtitle_node

# the Subtitle is assumed to be larger than 19 points
            subtitle_headers = trial_subtitle_node.xpath(
                ".//TOKEN[(@font-size < {0} and @bold = 'no' and @italic= 'no') or (@font-size > {1} and @bold = 'no' and @italic= 'yes')]"
                .format(20, 13))
            if Verbose_flag:
                print "subtitle_headers:"
                print subtitle_headers
            subtitle_path = '../../../../output/parse_result/' + directiory + '/subtitle.txt'
            title_path = '../../../../output/parse_result/' + directiory + '/title.txt'

            subtitle_head_txt = ' '.join([
                etree.tostring(el, method='text', encoding="UTF-8")
                for el in subtitle_headers
            ])
            if len(subtitle_head_txt) and not subtitle_head_txt.isdigit():
                if title_head_txt == "Project proposal":
                    subtitle_path = title_path
                    print "Subtitle: not found"
                    print "Title: found since title is project proposal, replace subtitle as title"
                txt = subtitle_head_txt
                st = 'subtitle'
                json_append(st, txt)
                # with open(subtitle_path, 'w') as f:
                #   print txt+ "\n"  # print tag information to certain file
                #
                #   print >> f, txt, "\n"  # print tag information to certain file
                subtitle_node = trial_subtitle_node
                next_block = block + 1
                print "Subtitle: found"
                break

            block = block + 1

        except IndexError:
            page += 1

    # find author - on cover page
    Found_Author = False
    Found_Level = False
    author_path = '../../../../output/parse_result/' + directiory + '/author_detail.txt'
    frontname_path = '../../../../output/parse_result/' + directiory + '/front_name.txt'
    aftername_path = '../../../../output/parse_result/' + directiory + '/after_name.txt'
    page = 1
    block = next_block
    auth_node = None
    auth_count = 0
    while (page < 2):
        try:
            trial_auth_node = tree.xpath("//PAGE[{0}]//BLOCK[{1}]".format(
                page, block))[0]
            if Verbose_flag:
                print "trial_auth_node:"
                print trial_auth_node

# the author's name(s) is(are) assumed to be smaller than title   bigger than   degree project...
            auth_headers = trial_auth_node.xpath(
                ".//TOKEN[@font-size < {0}  and @font-size > {1}]".format(
                    20, 11))
            if Verbose_flag:
                print "auth_headers:"
                print auth_headers
            print_log(document_type)
            auth_head_txt = ' '.join([
                etree.tostring(el, method='text', encoding="UTF-8")
                for el in auth_headers
            ])
            auth_list = auth_head_txt.split(";")

            while (len(auth_head_txt) > 0) and auth_count < 2 and len(
                    auth_list) > auth_count:  #found
                print "Author: found"
                auth_head_txt = auth_list[auth_count - 1]
                auth_count += 1

                name_split = auth_head_txt.split()
                txt = auth_head_txt
                author = author + "_" + auth_head_txt
                author_path = '../../../../output/parse_result/' + directiory + '/author_' + str(
                    auth_count) + '.txt'
                st = 'author_' + str(auth_count)
                json_append(st, txt)
                # with open(author_path, 'w') as f:
                #     print txt + "in" + author_path
                #     print >> f, txt, "\n"  # print tag information to certain file
                txt = name_split[0]

                frontname_path = '../../../../output/parse_result/' + directiory + '/author_' + str(
                    auth_count) + '_frontname' + '.txt'
                st = 'author_' + str(auth_count) + '_frontname'
                json_append(st, txt)

                # with open(frontname_path, 'w') as f:
                #     print txt + "in" + frontname_path
                #     print >> f, txt, "\n"  # print tag information to certain file

                txt = name_split[1]

                aftername_path = '../../../../output/parse_result/' + directiory + '/author_' + str(
                    auth_count) + '_aftername' + '.txt'

                st = 'author_' + str(auth_count) + '_aftername'
                json_append(st, txt)

                # with open(aftername_path, 'w') as f:
                #     print txt + "in" + aftername_path
                #     print >> f, txt, "\n"  # print tag information to certain file
                auth_node = trial_auth_node

            block = block + 1
        except IndexError:
            page += 1

    font_sizes = tree.xpath('//TOKEN/@font-size')
    mean_font_size = mean(font_sizes)
    median_font_size = median(font_sizes)

    #    print "Median Font Size (i.e. body text):", median_font_size

    font_colors = tree.xpath('//TOKEN/@font-color')
    font_color_hash = {}
    for fc in font_colors:
        try:
            font_color_hash[fc] += 1
        except KeyError:
            font_color_hash[fc] = 1

    sortlist = [(v, k) for k, v in font_color_hash.iteritems()]
    sortlist.sort(reverse=True)
    main_font_color = sortlist[0][1]
    head_txts = []
    stop = False

    page = 0
    Found_abstract = False
    Found_org = False
    Found_key = False
    Found_Sammanfattning = False
    Found_Method = False
    Found_Introduction = False
    Found_TOC = False
    OrgandSup_path = '../../../../output/parse_result/' + directiory + '/Orignization_supervisor(en).txt'
    key_path = '../../../../output/parse_result/' + directiory + '/Keyword(en).txt'
    abstractOut_path = '../../../../output/parse_result/' + directiory + '/abstract(en).txt'
    abstractsvOut_path = '../../../../output/parse_result/' + directiory + '/abstract(sv).txt'
    referat_path = '../../../../output/parse_result/' + directiory + '/referat(sv).txt'
    methodOut_path = '../../../../output/parse_result/' + directiory + '/method(en).txt'
    toc_path = '../../../../output/parse_result/' + directiory + '/toc(en).txt'
    introductionOut_path = '../../../../output/parse_result/' + directiory + '/introduction(en).txt'
    heading_path = '../../../../output/parse_result/' + directiory + '/heading.txt'
    title_path = '../../../../output/parse_result/' + directiory + '/title.txt'

    #page node
    for page_node in tree.xpath('//PAGE'):
        page = page + 1
        block_number = 0
        for block_node in page_node.xpath('.//BLOCK'):
            block_number = block_number + 1
            if xmltag:
                #specify data mining model
                #all gone to heading....not working!!

                if block_node == title_node:  #found title
                    st = "title"
                    et = "title"
                if block_node == subtitle_node:  #found subtitle
                    st = "subtitle"
                    et = "subtitle"
                elif block_node == auth_node:  #found author #not working
                    st = "author"
                    et = "author"
                else:
                    st = "heading"
                    et = "heading"  #found other headings

                if highlight:
                    st = "\033[0;32m{0}\033[0m".format(st)
                    et = "\033[0;32m{0}\033[0m".format(et)
            else:
                st = et = ""
            if block_node == title_node and authonly:
                continue
# note that the assumption that the Abstract heading is set in a larger font than the median font size used on a page will not find
# abstracts of Aalto university - they set the word ABSTRACT in only a slightly larger font than the one used for the rest of the text, but they do set it in all CAPS
            if look_for_all_caps_headings:
                headers = block_node.xpath(
                    ".//TOKEN[(@font-size > {0} and @bold = 'yes') or @font-color != '{1}']"
                    .format(mean_font_size, main_font_color))
            else:
                headers = block_node.xpath(
                    ".//TOKEN[(@font-size > {0} and @bold = 'yes') or @font-color != '{1}']"
                    .format(mean_font_size * 1.05, main_font_color))
            level_headers = block_node.xpath(
                ".//TOKEN[@font-size > {0}]".format(0))

            head_txt = ' '.join([
                etree.tostring(el, method='text', encoding="UTF-8")
                for el in headers
            ])
            level_head_txt = ' '.join([
                etree.tostring(el, method='text', encoding="UTF-8")
                for el in level_headers
            ])

            # print head_txt
            if head_txt in text_start_to_exclude:
                start_to_exclude = True
            head_txt = filter_headings(head_txt)

            if len(head_txt) and (not start_to_exclude):
                head_txts.append("{0}{1}{2}".format(
                    st, head_txt, et))  #append st tag_content andet

                # model for proposal
            if (int(document_type) == 1):
                print_log("first content check: " + head_txt)
                if head_txt.find("Authors") >= 0 or head_txt.find(
                        "Author") >= 0:
                    if not Found_Author:  # if the abstract has not been found yet
                        print "Authors(en): OVERIDE "
                        print "Authors and detail information (en): found "
                        author = ""
                        output_text_on_block_on_page(page_node, block_number,
                                                     page, author_path)
                        author = auth
                        Found_Author = True

                if level_head_txt.find("Bachelor") >= 0 or level_head_txt.find(
                        "Master") >= 0 or level_head_txt.find(
                            "Degree Project") >= 0:
                    if not Found_Level:  # if the abstract has not been found yet
                        print_log("Level: found")
                        level_path = '../../../../output/parse_result/' + directiory + '/level.txt'
                        st = 'level'
                        json_append(st, level_head_txt)
                        # with open(level_path, 'w') as f:
                        #     print level_head_txt + "\n"  # print tag information to certain file
                        #     print >> f, level_head_txt, "\n"  # print tag information to certain file

                        Found_Level = True

                if head_txt.find("Organization and Supervisor") >= 0 or (
                        head_txt.find("Organization") >= 0
                        and head_txt.find("Supervisor") >= 0):
                    if not Found_org:  # if the abstract has not been found yet
                        print "Organization and Supervisor (en): found"
                        output_blocks_on_page(page_node, block_number, page,
                                              OrgandSup_path, 0)
                        Found_org = True

                if head_txt.find("Keywords") >= 0 or head_txt.find(
                        "Keyword") >= 0:
                    print_log("I should be herer!!!!!")
                    if not Found_key:  # if the abstract has not been found yet
                        print "Keywords(en): found"
                        output_blocks_on_page(page_node, block_number, page,
                                              key_path, 0)
                        Found_key = True

                # model for thesis
            if head_txt.find("Abstract") >= 0 or head_txt.find(
                    "ABSTRACT") >= 0:
                if not Found_abstract:  #if the abstract has not been found yet
                    print "Abstract (en): found"
                    output_blocks_on_page(page_node, block_number, page,
                                          abstractOut_path, 0)
                    Found_abstract = True
                break

            if head_txt.find("Sammanfattning") >= 0 or head_txt.find(
                    "SAMMANFATTNING") >= 0:
                if not Found_Sammanfattning:
                    print "Sammanfattning (sv): found"
                    output_blocks_on_page(page_node, block_number, page,
                                          abstractsvOut_path, 0)
                    Found_Sammanfattning = True
                break

            if head_txt.find("Abstrakt") >= 0 or head_txt.find(
                    "ABSTRAKT") >= 0:
                if not Found_Sammanfattning:
                    print "Abstrakt (sv): found"
                    output_blocks_on_page(page_node, block_number, page,
                                          abstractOut_path, 0)
                    Found_Sammanfattning = True
                break

            if head_txt.find("Referat") >= 0 or head_txt.find("REFERAT") >= 0:
                if not Found_Sammanfattning:
                    print "Referat (sv): found"
                    output_blocks_on_page(page_node, block_number, page,
                                          referat_path, 0)
                    Found_Sammanfattning = True
                break
                #table of content
            if head_txt.find("Table of Contents") >= 0 or head_txt.find(
                    "Contents") >= 0:
                if not Found_TOC:  # if the abstract has not been found yet
                    print "TOC (en): found"
                    output_blocks_on_page(page_node, block_number, page,
                                          toc_path, 0)
                    Found_TOC = True
                break

            if head_txt.find("Introduction") >= 0 or head_txt.find(
                    "INTRODUCTION") >= 0:
                if not Found_Introduction:  # if the abstract has not been found yet
                    print "Introduction (en): found"
                    output_blocks_on_page(page_node, block_number, page,
                                          introductionOut_path, 1)
                    Found_Introduction = True

                    #Found_Introduction = True
                break

            if head_txt.find("Methods") >= 0 or head_txt.find(
                    "METHODS") >= 0 or head_txt.find(
                        "Methodology") >= 0 or head_txt.find(
                            "METHODOLOGY") >= 0:
                if not Found_Method:  #if the abstract has not been found yet
                    print "Methods (en): found"
                    output_blocks_on_page(page_node, block_number, page,
                                          methodOut_path, 0)
                    Found_Method = True
                break


#
#            if head_txt.find("Abstracto(sp)") >= 0:
#                    print "Abstracto (sp):"
#                    output_blocks_on_page(page_node, block_number, page)
#                break
#
#            if head_txt.find("Abstrait (fr)") >= 0:
#                    print "Abstrait (fr):"
#                    output_blocks_on_page(page_node, block_number, page)
#                break

            if block_node == title_node and titleonly:
                stop = True
                break
            elif block_node == auth_node and authonly:
                stop = True
                break
        if stop:
            break