Python PDFTableLoader 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: TableLoader

클래스/타입: PDFTableLoader

hotexamples.com에서의 예제들: 2

Python PDFTableLoader - 2개의 예제를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한 Python TableLoader.PDFTableLoader의 실제 사용 예제 가운데 평가가 가장 높은 것들입니다. 예제를 평가해 주시면 예제의 품질을 높이는 데 도움이 됩니다.

자주 사용되는 메소드들

보기 숨기기

PDFTableLoader(2)

createTable(2)

loadUnprocessed(2)

예제 #1

파일 보기

def _assignStrandsFromGeneTable(geneTable, entries, warnOnOldName=False):
    """
    Set both strand keys of every entry from the matching gene table id.

    Each entry is looked up in the gene table by its "name" field; when that
    yields no id, the gene table's other (old) names are searched instead.
    The strand is the third delimiter-separated field of the matched id, or
    None when no match carries data.

    :param geneTable: table offering findByField and findByOtherNames
    :param entries: iterable of dict-like entries; modified in place
    :param warnOnOldName: print a warning whenever the strand was taken from
                          a match found through the old-name fallback
    :return: None
    """
    for entry in entries:

        geneId, data = geneTable.findByField(Table.NAME_FIELD, entry["name"])
        matchedByOldName = False

        # if there is no match by the current name look in the old names
        if geneId is None:
            geneId, data = geneTable.findByOtherNames(entry["name"])
            matchedByOldName = True

        strand = None

        if data is not None:
            # warn only when the fallback was really used, not on every match
            if warnOnOldName and matchedByOldName:
                print("[warning] using by old name")

            strand = geneId.split(Table.ID_DELIMITER)[2]

        entry[TableGlobals.FIRST_STRAND_KEY] = strand
        entry[TableGlobals.SECOND_STRAND_KEY] = strand


def addDirectionalityToPdfTables():
    """
    Add strand (direction) information to the tables parsed from the PDF and
    dump the results under ./final_format/.

    Table 2, S5 and S8 take their strand from the gene table (matched by
    name, falling back to old names). S7 already carries a "strand" column
    which is translated to the Table strand constants. S6 is loaded and
    dumped without any change.

    :return: None
    """

    loader = GeneTableLoader()
    geneTable = loader.createTable("geneDB", loader.loadUnprocessed("our_files/genes.col"))

    pdfLoader = PDFTableLoader()

    # table 2 (not from supplementary), S5 and S8: copy the strand from the
    # gene table; only S8 reports matches that came from an old name
    for tableName, warnOnOldName in (("2", False), ("s5", False), ("s8", True)):

        result = pdfLoader.loadUnprocessed("./parsed_files/%s.table" % tableName)

        _assignStrandsFromGeneTable(geneTable, result, warnOnOldName)

        pdfTable = pdfLoader.createTable("dummy", result)
        pdfTable.dump("./final_format/%s_directed.table" % tableName)

    # S7 handling: the strand is already part of the parsed table
    result = pdfLoader.loadUnprocessed("./parsed_files/s7.table")

    for entry in result:

        strand = entry.pop("strand")

        # any value other than "f" / "r" is kept as it was parsed
        if strand == "f":
            strand = Table.POS_STRAND

        elif strand == "r":
            strand = Table.NEG_STRAND

        entry[TableGlobals.FIRST_STRAND_KEY] = strand
        entry[TableGlobals.SECOND_STRAND_KEY] = strand

    pdfTable = pdfLoader.createTable("dummy", result)
    pdfTable.dump("./final_format/s7_directed.table")

    # S6 handling: dumped as loaded
    result = pdfLoader.loadUnprocessed("./parsed_files/s6.table")
    pdfLoader.createTable("dummy", result).dump("./final_format/s6_directed.table")

예제 #2

파일 보기

def _coordinateDiff(pdfId, geneId):
    """
    Return the (start, end) difference between two ";"-separated ids.

    Both ids must split into exactly six fields; only the first two (start
    and end of the first entry) are used. The result is the PDF value minus
    the gene table value for each of them.

    :param pdfId: id of the entry in the PDF table
    :param geneId: id of the matched entry in the gene table
    :return: tuple (startDiff, endDiff) of ints
    """
    pdfStart, pdfEnd, _, _, _, _ = pdfId.split(";")
    geneStart, geneEnd, _, _, _, _ = geneId.split(";")

    return int(pdfStart) - int(geneStart), int(pdfEnd) - int(geneEnd)


def printMatchAnalysis():
    """
    This function is used to print the matched directions for the known rna
    from the pdf so we can make sure the direction we matched them is valid
    and there were no problems with the names (since there are new and old
    ones and the article is from 2012).

    It prints, in order: how often old names collide with major names, how
    often old names collide between themselves, the S5 entries matched by
    major name with their coordinate diff, and the S5 entries that had to be
    looked up through old names.

    :return: None
    """

    loader = GeneTableLoader()
    result = loader.loadUnprocessed("genes.col")

    table = loader.createTable("geneDB", result)

    # occurrences of every major name, keyed by the lower-cased name
    count = {}

    for entry in result:

        name = entry["name"].lower()
        count[name] = count.get(name, 0) + 1

        if count[name] > 1:
            print("warning")

    # occurrences of old names that are not major names, lower-cased keys
    old_count = {}

    warnings = []
    old_warnings = []

    for entry in result:

        for name in entry["other_names"]:

            lower_name = name.lower()

            if lower_name in count:
                count[lower_name] += 1
                warnings.append("warning: %s is an old name of %s = %d\n"
                                % (lower_name, entry["name"], count[lower_name]))
            elif lower_name in old_count:
                old_count[lower_name] += 1
                old_warnings.append("warning: old multiplicity for %s = %d\n"
                                    % (lower_name, old_count[lower_name]))
            else:
                old_count[lower_name] = 1

    separator = "-" * 100

    print(separator)
    print("counting instances of major names and old names")
    print(separator)
    print("".join(warnings))
    print(separator)
    print("counting instances of old names between themselves")
    print(separator)
    print("".join(old_warnings))
    print(separator)

    pdfLoader = PDFTableLoader()

    result = pdfLoader.loadUnprocessed("./parsed_files/s5.table")
    pdfTable = pdfLoader.createTable("s5", result)

    print(separator)
    print("print matches to majors")
    print(separator)

    other_name_matches = []

    # every key in total_count is lower-cased, so lookups must lower-case too
    total_count = dict(count)
    total_count.update(old_count)

    #table_name = "srna-name"
    table_name = "name"

    # find matching candidates to copy their directionality
    for entry in result:

        name = entry[table_name]
        match = table.findByField("name", name)

        if match != (None, None):

            startDiff, endDiff = _coordinateDiff(
                pdfTable.findByField(table_name, name)[0], match[0])

            print("%s from PDF match to: %s diff is: %d;%d" % (name, match[0], startDiff, endDiff))

            if total_count[name.lower()] > 1:
                print("%s has multiple instances" % name)
        else:

            match = table.findByOtherNames(name)

            other_name_matches.append("%s match to other names: %s\n" % (name, match[0]))

            if match[0] is not None:
                startDiff, endDiff = _coordinateDiff(
                    pdfTable.findByField(table_name, name)[0], match[0])

                other_name_matches.append("the diff is %d;%d\n" % (startDiff, endDiff))

            # lower-case the key: the raw name would miss any mixed-case name
            lower_name = name.lower()

            if lower_name in total_count:
                other_name_matches.append("%s appears %d in countings\n"
                                          % (name, total_count[lower_name]))

    print(separator)
    print("matches to old names (in case of no match to major)")
    print(separator)
    print("".join(other_name_matches))