def _processChoices(self, acronym_expansions): """ input: list(acronym expansion strings) returns: y_labels (list): of integer labels assigned to acronym expansions labelToExpansion (dict): to convert label number to acronym expansion """ y_labels = [] labelToExpansion = {} if(len(acronym_expansions) == 0): return y_labels, labelToExpansion y_labels = [index for index in range(len(acronym_expansions))] labelToExpansion[0] = acronym_expansions[0] for indexAhead in range(1, len(acronym_expansions)): new_expansion = acronym_expansions[indexAhead] newIsUnique = True # check if new_expansion is same as a previous expansion # if same assign previous label and move on for label, expansion in labelToExpansion.items(): if(AcronymExpansion.areExpansionsSimilar(expansion, new_expansion)): newIsUnique = False y_labels[indexAhead] = label break # if label is new indeed, then give it a label ID (integer) and # make an entry in the labelToExpansion dictionary if(newIsUnique): new_class_label = len(labelToExpansion) labelToExpansion[new_class_label] = new_expansion y_labels[indexAhead] = new_class_label return y_labels, labelToExpansion
def createFromScrapedDefinitions():
    """
    Build the AcronymDB from the scraped definition CSV files and dump it.

    input: none (reads every path in file_scraped_definitions_list; each
        file is parsed as a comma-delimited CSV with a header row and the
        columns "acronym", "acronym_expansion" and "article_id" are used)
    returns: None (the finished dict is handed to dump())

    The dict maps each acronym to a numpy array built from rows of
    [expansion, article_id, def_count]. Expansions are stripped,
    lower-cased and have '-' replaced by ' '. Rows of one acronym are
    sorted by expansion; def_count starts at 0 and is incremented each
    time an expansion does not AcronymExpansion.startsSameWay as the
    current representative expansion. Rows that do match have their
    expansion overwritten with that representative.
    """
    common_logger.info("Creating AcronymDB")
    csv.field_size_limit(sys.maxint)

    acronymDB = {}
    loaded_acronyms = 0
    for definition_file in file_scraped_definitions_list:
        # Context manager so the handle is closed once the file has been
        # consumed instead of leaking one open file per definition file.
        with open(definition_file, "rb") as definition_handle:
            # open as csv file with headers
            acronym_csv = csv.DictReader(definition_handle, delimiter=",")
            for row in acronym_csv:
                acronym = toUnicode(row["acronym"])
                acronym_expansion = toUnicode(row["acronym_expansion"])
                article_id = toUnicode(row["article_id"])

                normalized_expansion = (
                    acronym_expansion.strip().lower().replace('-', ' '))
                # The old format also carried row["article_title"]; it is
                # no longer stored.
                acronymDB.setdefault(acronym, []).append(
                    [normalized_expansion, article_id])

                loaded_acronyms += 1
                if loaded_acronyms % 10000 == 0:
                    common_logger.debug(
                        "loaded %d acronyms", loaded_acronyms)

    common_logger.info("adding def_count values to acronymDB")
    for acronym, values_for_this_acronym in acronymDB.items():
        # Sorting by expansion places expansions that start the same way
        # next to each other, so one linear pass can group them.
        values_for_this_acronym = sorted(
            values_for_this_acronym, key=lambda x: x[0])

        def_count = 0
        expansion_of_last_acronym = values_for_this_acronym[0][0]
        for entry in values_for_this_acronym:
            if AcronymExpansion.startsSameWay(entry[0],
                                              expansion_of_last_acronym):
                # Same definition as the current group: collapse the text
                # onto the group's representative expansion.
                entry[0] = expansion_of_last_acronym
            else:
                # A new definition starts here and becomes the
                # representative for the rows that follow.
                def_count += 1
                expansion_of_last_acronym = entry[0]
            entry.append(def_count)

        acronymDB[acronym] = numpy.array(values_for_this_acronym)

    dump(acronymDB)
    common_logger.info("Dumped AcronymDB successfully")