Exemple #1
0
def applyManualCorrections(acronymDB):
    """Overwrite known-wrong expansions in ``acronymDB`` with corrected ones.

    Reads the comma-delimited file named by ``file_msh_manual_corrections``.
    Each row must provide the columns ``acronym``, ``wrong_expansion`` and
    ``correct_expansion``. For every entry stored under the row's acronym,
    the first element is replaced by the correct expansion when it equals
    the wrong one.

    Args:
        acronymDB: mapping from acronym to a list of mutable entries whose
            first element is the expansion. It is modified in place.

    Returns:
        The same ``acronymDB`` object that was passed in.

    Note:
        The acronym is looked up with plain indexing, so a row naming an
        acronym that is absent from a regular dict raises ``KeyError``.
    """
    # The context manager guarantees the corrections file is closed even if
    # a row fails to process; previously the handle was never closed.
    # NOTE(review): binary mode is kept as in the original -- presumably for
    # the Python 2 csv module; confirm before porting to Python 3.
    with open(file_msh_manual_corrections, "rb") as corrections_file:
        for line in DictReader(corrections_file, delimiter=","):
            acronym = TextTools.toUnicode(line["acronym"])
            wrong_exp = TextTools.toUnicode(line["wrong_expansion"])
            correct_exp = TextTools.toUnicode(line["correct_expansion"])

            for entry in acronymDB[acronym]:
                if entry[0] == wrong_exp:
                    entry[0] = correct_exp

    return acronymDB
Exemple #2
0
def _createArticleAndAcronymDB():
    """Build the acronym and article databases from the ARFF files.

    Every file in ``folder_msh_arff`` is read with ``arff.Reader``. The
    file name yields the acronym (stored upper-cased) and the reader's
    ``relation`` string, split on ``"_"``, yields the CUIDs that the rows'
    ``class`` values index into.

    Returns:
        A tuple ``(acronymDB, articleDB)`` where

        * ``acronymDB`` maps each acronym to a list of
          ``[expansion, pmid, 0]`` entries, one per row. When no expansion
          was found for the row's CUID, the CUID itself is stored in the
          first position and an error is logged.
        * ``articleDB`` maps each PMID to the citation text with markup
          removed. The first text seen for a PMID is kept.
    """
    acronymExpander = Expander_fromText_v2()
    articleDB = {}
    acronymDB = {}
    # Shared across all files: once a CUID has an expansion it is reused.
    CUID_to_expansion = {}
    for fileName in os.listdir(folder_msh_arff):
        filePath = os.path.join(folder_msh_arff, fileName)
        # Close each ARFF file as soon as it has been read; previously one
        # handle per file in the folder was left open.
        with open(filePath, "rb") as arff_file:
            file_reader = arff.Reader(arff_file)
            # the iterator needs to be called for the self.relation part to
            # be initialized
            lines = list(file_reader)
            cuids = file_reader.relation.strip().split("_")
        # storing all acronyms as uppercase values
        acronym = _fileNameToAcronym(fileName).upper()
        cuid_and_pmid = []
        for line in lines:
            pmid = unicode(line.PMID)
            text = TextTools.toUnicode(line.citation)
            cuid = cuids[_classToIndex(line["class"])]
            textWithoutMarkup = _removeMarkup(text)
            if cuid not in CUID_to_expansion:
                acronymExpansions = acronymExpander.expand(
                    acronym, [], textWithoutMarkup)
                # Only accept an expansion that differs from the acronym
                # itself; otherwise later rows get another chance.
                if (len(acronymExpansions) != 0 and
                        acronymExpansions[0].expansion != acronym):
                    CUID_to_expansion[cuid] = acronymExpansions[0].expansion
            if pmid not in articleDB:
                articleDB[pmid] = textWithoutMarkup
            cuid_and_pmid.append([cuid, pmid])

        if acronym in acronymDB:
            # Entries for the duplicate are still appended to the existing
            # list below.
            common_logger.error("acronym already present in acronymDB")
        else:
            acronymDB[acronym] = []
        # Entries are built only after the whole file was scanned, so an
        # expansion found in a later row also applies to earlier rows.
        for cuid, pmid in cuid_and_pmid:
            if cuid in CUID_to_expansion:
                acronymDB[acronym].append([CUID_to_expansion[cuid], pmid, 0])
            else:
                common_logger.error(
                    "Expansion not found for CUID %s of %s" % (cuid, acronym))
                acronymDB[acronym].append([cuid, pmid, 0])

    return acronymDB, articleDB