예제 #1
0
def addDirectionalityToPdfTables():

    loader = GeneTableLoader()
    geneTable = loader.createTable("geneDB", loader.loadUnprocessed("our_files/genes.col"))


    pdfLoader = PDFTableLoader()

    # handling table 2 (not from supplementary)
    result = pdfLoader.loadUnprocessed("./parsed_files/2.table")

    # match for each entry in the in s5 table a strand
    for entry in result:

        id, data = geneTable.findByField(Table.NAME_FIELD, entry["name"])

        strand = None

        # if there is no data look in the old names
        if (id == None):
            id, data = geneTable.findByOtherNames(entry["name"])

        if (data != None):
            strand = id.split(Table.ID_DELIMITER)[2]

        entry[TableGlobals.FIRST_STRAND_KEY] = strand
        entry[TableGlobals.SECOND_STRAND_KEY] = strand

    pdfTable = pdfLoader.createTable("dummy", result)
    pdfTable.dump("./final_format/2_directed.table")

    # S5 handling
    result = pdfLoader.loadUnprocessed("./parsed_files/s5.table")

    # match for each entry in the in s5 table a strand
    for entry in result:

        id, data = geneTable.findByField(Table.NAME_FIELD, entry["name"])

        strand = None

        # if there is no data look in the old names
        if (id == None):
            id, data = geneTable.findByOtherNames(entry["name"])

        if (data != None):
            strand = id.split(Table.ID_DELIMITER)[2]

        entry[TableGlobals.FIRST_STRAND_KEY] = strand
        entry[TableGlobals.SECOND_STRAND_KEY] = strand

    pdfTable = pdfLoader.createTable("dummy", result)
    pdfTable.dump("./final_format/s5_directed.table")

    # S8 handling
    result = pdfLoader.loadUnprocessed("./parsed_files/s8.table")

    # match for each entry in the in s5 table a strand
    for entry in result:

        id, data = geneTable.findByField(Table.NAME_FIELD, entry["name"])

        strand = None

        # if there is no data look in the old names
        if (id == None):
            id, data = geneTable.findByOtherNames(entry["name"])

        if (data != None):
            print "[warning] using by old name"
            strand = id.split(Table.ID_DELIMITER)[2]

        entry[TableGlobals.FIRST_STRAND_KEY] = strand
        entry[TableGlobals.SECOND_STRAND_KEY] = strand

    pdfTable = pdfLoader.createTable("dummy", result)
    pdfTable.dump("./final_format/s8_directed.table")

    #S7 handling
    result = pdfLoader.loadUnprocessed("./parsed_files/s7.table")

    for entry in result:

        strand = entry.pop("strand")

        if (strand == "f"):
            strand = Table.POS_STRAND

        elif (strand == "r"):
            strand = Table.NEG_STRAND

        entry[TableGlobals.FIRST_STRAND_KEY] = strand
        entry[TableGlobals.SECOND_STRAND_KEY] = entry[TableGlobals.FIRST_STRAND_KEY]

    pdfTable = pdfLoader.createTable("dummy", result)
    pdfTable.dump("./final_format/s7_directed.table")

    # S6 handling
    result = pdfLoader.loadUnprocessed("./parsed_files/s6.table")
    pdfLoader.createTable("dummy", result).dump("./final_format/s6_directed.table")
예제 #2
0
def printMatchAnalysis():
    """
    This function is used to print the matched directions for the know rna from
    the pdf so we can make sure the direction we matched them is valid and there
    was no problems with the name (since there are new and old ones and the article
    is from 2012).

    :return: None
    """

    loader = GeneTableLoader()
    result = loader.loadUnprocessed("genes.col")

    table = loader.createTable("geneDB", result)

    count = {}

    for entry in result:

        name = entry["NAME".lower()].lower()

        if not count.has_key(name):
            count[name] = 1
        else:
            count[name] += 1

        if count[name] > 1:
            print "warning"

    old_count = {}

    warnings = ""
    old_warnings = ""

    for entry in result:

        for name in entry["other_names"]:

            lower_name = name.lower()

            if count.has_key(lower_name):
                count[lower_name] += 1
                warnings += ("warning: %s is an old name of %s = %d\n" % (lower_name, entry["NAME".lower()], count[lower_name]))
            elif (old_count.has_key(lower_name)):
                old_count[lower_name] += 1
                old_warnings += ("warning: old multiplicity for %s = %d\n" % (lower_name, old_count[lower_name]))
            else:
                old_count[lower_name] = 1

    print "-" * 100
    print "counting instances of major names and old names"
    print "-" * 100
    print warnings
    print "-" * 100
    print "counting instances of old names between themselves"
    print "-" * 100
    print old_warnings
    print "-" * 100

    # matches = table.is_overlaps(510860, 510863, "+")
    #
    # for match in matches:
    #     res, id = match
    #     print res, table.findById(id)
    #
    # print table.findByName("ybaS")

    pdfLoader = PDFTableLoader()

    result = pdfLoader.loadUnprocessed("./parsed_files/s5.table")
    pdfTable = pdfLoader.createTable("s5", result)

    print "-" * 100
    print "print matches to majors"
    print "-" * 100


    other_name_matches = ""

    total_count = {}
    total_count.update(count)
    total_count.update(old_count)


    #table_name = "srna-name"
    table_name = "name"

    # find matching candidates to copy their directionality
    for entry in result:

        match = table.findByField("name", entry[table_name])


        if (match != (None, None)):

            first_entry_start, first_entry_end, dummy_a,\
                second_entry_start, second_entry_end, dummy_b = \
                pdfTable.findByField(table_name, entry[table_name])[0].split(";")

            startDiff = int(first_entry_start)
            endDiff = int(first_entry_end)

            first_entry_start, first_entry_end, dummy_a,\
                second_entry_start, second_entry_end, dummy_b = match[0].split(";")

            startDiff -= int (int(first_entry_start))
            endDiff -= int(first_entry_end)

            print "%s from PDF match to: %s diff is: %d;%d" % (entry[table_name], match[0], startDiff, endDiff)

            if(total_count[entry[table_name].lower()] > 1):
                print "%s has multiple instances" % entry[table_name]
        else:

            match = table.findByOtherNames(entry[table_name])

            other_name_matches += "%s match to other names: %s\n" % (entry[table_name], match[0])

            if (match[0] != None):
                first_entry_start, first_entry_end, dummy_a,\
                    second_entry_start, second_entry_end, dummy_b = \
                    pdfTable.findByField(table_name, entry[table_name])[0].split(";")

                startDiff = int(first_entry_start)
                endDiff = int(first_entry_end)

                first_entry_start, first_entry_end, dummy_a,\
                    second_entry_start, second_entry_end, dummy_b = match[0].split(";")

                startDiff -= int (int(first_entry_start))
                endDiff -= int(first_entry_end)

                other_name_matches += "the diff is %d;%d\n" % (startDiff, endDiff)

            if (total_count.has_key(entry[table_name])):

                other_name_matches += "%s appears %d in countings\n" % \
                                      (entry[table_name], total_count[entry[table_name]])

    print "-" * 100
    print "matches to old names (in case of no match to major)"
    print "-" * 100
    print other_name_matches