def applyManualCorrections(acronymDB):
    """Apply the manual expansion corrections listed in the corrections CSV.

    Each row of the CSV at ``file_msh_manual_corrections`` supplies the
    columns ``acronym``, ``wrong_expansion`` and ``correct_expansion``.  For
    every entry stored under that acronym whose first element equals the
    wrong expansion, the first element is overwritten with the correct one.

    Args:
        acronymDB: mapping from acronym to a list of mutable entries whose
            element 0 is the expansion.  It is modified in place.

    Returns:
        The same ``acronymDB`` object that was passed in.

    Raises:
        KeyError: if a row names an acronym that is not a key of
            ``acronymDB`` (for a plain dict; other mappings may differ).
    """
    # The reader is lazy, so the whole loop has to run while the file is
    # still open; the ``with`` block guarantees the handle is closed after.
    with open(file_msh_manual_corrections, "rb") as corrections_file:
        for line in DictReader(corrections_file, delimiter=","):
            acronym = TextTools.toUnicode(line["acronym"])
            wrong_exp = TextTools.toUnicode(line["wrong_expansion"])
            correct_exp = TextTools.toUnicode(line["correct_expansion"])

            for entry in acronymDB[acronym]:
                if entry[0] == wrong_exp:
                    entry[0] = correct_exp

    return acronymDB
def _createArticleAndAcronymDB():
    """Build the acronym and article databases from the ARFF files.

    Every file in ``folder_msh_arff`` is read as one acronym: the acronym
    comes from the file name (upper-cased) and the candidate CUIDs come from
    the ARFF relation name, split on ``"_"``.  Each row contributes one
    article (keyed by PMID, markup removed) and one ``[cuid, pmid]`` pair.

    Returns:
        A tuple ``(acronymDB, articleDB)`` where

        * ``acronymDB`` maps the upper-cased acronym to a list of
          ``[expansion, pmid, 0]`` entries.  When no expansion could be
          derived for a CUID, the CUID itself is stored in place of the
          expansion and an error is logged.
        * ``articleDB`` maps the PMID to the article text without markup.
    """
    acronymExpander = Expander_fromText_v2()
    articleDB = {}
    acronymDB = {}
    # Shared across all files: an expansion learnt for a CUID in one file is
    # reused if the same CUID shows up again later.
    CUID_to_expansion = {}

    for fileName in os.listdir(folder_msh_arff):
        filePath = os.path.join(folder_msh_arff, fileName)

        # Scope the handle so it is closed once this file has been read,
        # instead of leaving one open handle per corpus file.
        with open(filePath, "rb") as arff_file:
            file_reader = arff.Reader(arff_file)
            # the iterator needs to be called for the self.relation part to
            # be initialized
            lines = list(file_reader)
            cuids = file_reader.relation.strip().split("_")

        # storing all acronyms as uppercase values
        acronym = _fileNameToAcronym(fileName).upper()

        cuid_and_pmid = []
        for line in lines:
            pmid = unicode(line.PMID)
            text = TextTools.toUnicode(line.citation)
            cuid = cuids[_classToIndex(line["class"])]
            textWithoutMarkup = _removeMarkup(text)

            # Only try to derive an expansion while this CUID has none yet.
            if cuid not in CUID_to_expansion:
                acronymExpansions = []
                acronymExpansions = acronymExpander.expand(
                    acronym, acronymExpansions, textWithoutMarkup)
                # Ignore results that merely echo the acronym itself.
                if (len(acronymExpansions) != 0
                        and acronymExpansions[0].expansion != acronym):
                    CUID_to_expansion[cuid] = acronymExpansions[0].expansion

            # Keep the first text seen for a PMID.
            if pmid not in articleDB:
                articleDB[pmid] = textWithoutMarkup
            cuid_and_pmid.append([cuid, pmid])

        # NOTE(review): a duplicate acronym is logged, and its entries are
        # still appended to the existing list - confirm this is intended.
        if acronym in acronymDB:
            common_logger.error("acronym already present in acronymDB")
        else:
            acronymDB[acronym] = []

        # Resolved after the whole file is read, so an expansion found in a
        # late row also applies to earlier rows with the same CUID.
        for cuid, pmid in cuid_and_pmid:
            if cuid in CUID_to_expansion:
                acronymDB[acronym].append([CUID_to_expansion[cuid], pmid, 0])
            else:
                common_logger.error(
                    "Expansion not found for CUID %s of %s" % (cuid, acronym))
                acronymDB[acronym].append([cuid, pmid, 0])

    return acronymDB, articleDB