import csv
import sys
from io import StringIO

import numpy
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage

# common_logger, toUnicode, dump, AcronymExpansion and the file_* path
# constants are assumed to come from this project's helper/config modules.


def createFromScrapedDefinitions():
    common_logger.info("Creating AcronymDB")
    csv.field_size_limit(sys.maxsize)  # sys.maxint is Python 2 only

    acronymDB = {}
    loaded_acronyms = 0
    for definition_file in file_scraped_definitions_list:
        # open as csv file with headers
        acronym_csv = csv.DictReader(
            open(definition_file, "r", newline=""), delimiter=",")

        for row in acronym_csv:
            acronym = toUnicode(row["acronym"])
            acronym_expansion = toUnicode(row["acronym_expansion"])
            article_id = toUnicode(row["article_id"])

            if acronym not in acronymDB:
                acronymDB[acronym] = []
            acronymDB[acronym].append(
                [acronym_expansion.strip().lower().replace('-', ' '),
                 article_id])  # the old format also carried row["article_title"]

            loaded_acronyms += 1
            if loaded_acronyms % 10000 == 0:
                common_logger.debug("loaded %d acronyms", loaded_acronyms)

    common_logger.info("adding def_count values to acronymDB")
    defs_per_acronym = [0] * 1000
    insts_per_def = [0] * 1000
    for acronym, values_for_this_acronym in acronymDB.items():
        # sort so that near-identical expansions end up adjacent
        values_for_this_acronym = sorted(
            values_for_this_acronym, key=lambda x: x[0])

        def_count = 0
        inst_count = 0
        expansion_of_last_acronym = values_for_this_acronym[0][0]
        # rows in the old format also carried an article_title field
        for index, (acronym_expansion, article_id) in enumerate(
                values_for_this_acronym):
            if AcronymExpansion.startsSameWay(acronym_expansion,
                                              expansion_of_last_acronym):
                # same definition as the previous row: reuse its
                # def_count and canonical expansion
                inst_count += 1
                values_for_this_acronym[index].append(def_count)
                values_for_this_acronym[index][0] = expansion_of_last_acronym
            else:
                # new definition: record the instance count of the
                # previous one and start a new group
                insts_per_def[min(inst_count, len(insts_per_def) - 1)] += 1
                inst_count = 0
                def_count += 1
                expansion_of_last_acronym = acronym_expansion
                values_for_this_acronym[index].append(def_count)

        defs_per_acronym[min(def_count, len(defs_per_acronym) - 1)] += 1
        acronymDB[acronym] = numpy.array(values_for_this_acronym)

    dump(acronymDB)
    common_logger.info("Dumped AcronymDB successfully")
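# Usage sketch (a hypothetical helper, not part of the original module):
# each acronymDB entry built above is a numpy array of
# [expansion, article_id, def_count] rows, so the distinct expansions
# recorded for one acronym can be listed like this.
def list_expansions(acronymDB, acronym):
    if acronym not in acronymDB:
        return []
    return sorted({row[0] for row in acronymDB[acronym]})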
def get_text(self, file_path):
    # Plain-text files can be returned directly.
    if file_path[-4:] == '.txt':
        return open(file_path).read()

    # txt_path is only consumed by the cached-text check below,
    # which is currently disabled.
    txt_path = file_path
    if file_path[-4:] == '.pdf':
        txt_path = file_path[:-4] + '.txt'
    elif file_path[-4:] != '.txt':
        txt_path = file_path + '.txt'
    # if os.path.isfile(txt_path):
    #     return open(txt_path).read()

    if file_path[-4:] != '.pdf':
        file_path = file_path + '.pdf'

    # Extract text from the PDF with pdfminer's layout-analysis pipeline.
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, laparams=laparams)
    fp = open(file_path, 'rb')  # file() is Python 2 only
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()
    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                  password=password, caching=caching,
                                  check_extractable=True):
        interpreter.process_page(page)
    fp.close()
    device.close()

    result = toUnicode(retstr.getvalue())
    retstr.close()
    return result
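# Shorter alternative sketch: pdfminer.six also exposes a high-level
# extract_text() call that wraps the resource-manager/interpreter
# pipeline used above. This assumes the installed package is
# pdfminer.six; the original pdfminer for Python 2 does not ship
# this module.
def pdf_to_text(pdf_path):
    from pdfminer.high_level import extract_text
    return extract_text(pdf_path)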
def createFromScrapedArticles():
    common_logger.info("Creating ArticleDB")
    csv.field_size_limit(sys.maxsize)  # sys.maxint is Python 2 only

    articleDB = {}
    loaded_articles = 0
    for article_file in file_scraped_articles_list:
        # open as csv file with headers
        article_csv = csv.DictReader(
            open(article_file, "r", newline=""), delimiter=",")

        for row in article_csv:
            article_id = toUnicode(row["article_id"])
            articleDB[article_id] = toUnicode(row["article_text"])

            loaded_articles += 1
            if loaded_articles % 10000 == 0:
                common_logger.debug("loaded %d articles", loaded_articles)

    dump(articleDB, path=file_articledb)
    common_logger.info("Dumped ArticleDB successfully")
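if __name__ == "__main__":
    # Minimal driver sketch, assuming the scraped CSV path lists used by
    # both builders are configured in this project's helper modules.
    createFromScrapedDefinitions()
    createFromScrapedArticles()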