def serialGetWordCorpus(articleDB):
    """Build a word corpus from an article database, serially.

    Args:
        articleDB: dict mapping article_id -> raw article text.
    Returns:
        dict mapping article_id -> list of cleaned words, produced by
        TextTools.getCleanedWords with the module-level stem_words and
        removeNumbers settings.
    """
    word_corpus = {}
    for article_id, text in articleDB.items():
        cleaned = TextTools.getCleanedWords(
            text, stem_words=stem_words, removeNumbers=removeNumbers)
        word_corpus[article_id] = cleaned
        # emit a progress line after every 1000 converted articles
        done = len(word_corpus)
        if done % 1000 == 0:
            common_logger.debug("converted " + str(done) + " articles to words")
    return word_corpus
def load(path=file_acronymdb):
    """Load the pickled acronymDB from disk.

    acronymdb is a dictionary in the format:
        (acronym: [list of [acronym_expansion, article_id]])

    Args:
        path: pickle file to read; defaults to file_acronymdb.
    Returns:
        the unpickled acronymDB dictionary.
    """
    common_logger.debug("loading acronymDB from %s" % path)
    # Use a context manager so the file handle is closed even if
    # pickle.load raises (the original leaked the handle).
    with open(path, "rb") as db_file:
        return pickle.load(db_file)
def createFromScrapedDefinitions():
    """Build the acronymDB from the scraped-definition CSV files and dump it.

    Reads every CSV in file_scraped_definitions_list (columns: acronym,
    acronym_expansion, article_id), groups rows by acronym, assigns each
    row a definition index (def_count) based on expansion similarity,
    converts each acronym's rows to a numpy array, and pickles the whole
    DB via dump().
    """
    common_logger.info("Creating AcronymDB")
    # allow arbitrarily large CSV fields (Python 2: sys.maxint)
    csv.field_size_limit(sys.maxint)
    acronymDB = {}
    loaded_acronyms = 0
    for definition_file in file_scraped_definitions_list:
        # open as csv file with headers
        acronym_csv = csv.DictReader(
            open(definition_file, "rb"), delimiter=",")
        for row in acronym_csv:
            acronym = toUnicode(row["acronym"])
            acronym_expansion = toUnicode(row["acronym_expansion"])
            article_id = toUnicode(row["article_id"])
            if(acronym not in acronymDB):
                acronymDB[acronym] = []
            # normalize the expansion: trimmed, lower-case, hyphens -> spaces
            acronymDB[acronym].append([acronym_expansion
                                       .strip().lower().replace('-', ' '), article_id])
            # , row["article_title"]]) # title was part of old format
            loaded_acronyms += 1
            if(loaded_acronyms % 10000 == 0):
                common_logger.debug("loaded %d acronyms", loaded_acronyms)

    common_logger.info("adding def_count values to acronymDB")
    # histogram buckets (index clamped to 999) — collected but not dumped here
    defs_per_acronym = [0] * 1000
    insts_per_def = [0] * 1000
    #num_acronyms = len(acronymDB)
    for acronym, values_for_this_acronym in acronymDB.items():
        # sort by normalized expansion so similar expansions are adjacent
        values_for_this_acronym = sorted(
            values_for_this_acronym, key=lambda x: x[0])
        def_count = 0
        inst_count = 0
        expansion_of_last_acronym = values_for_this_acronym[0][0]
        #, article_title]\ # title was part of old format in the line below
        for index, [acronym_expansion, article_id]\
                in enumerate(values_for_this_acronym):
            if AcronymExpansion.startsSameWay(acronym_expansion,
                                              expansion_of_last_acronym):
                # same definition as the previous row: tag it with the
                # current def_count and canonicalize its expansion text
                inst_count += 1
                values_for_this_acronym[index].append(def_count)
                values_for_this_acronym[index][0] = expansion_of_last_acronym
            else:
                # new definition: record the finished run's instance count,
                # then start a new def_count with this expansion as canonical
                insts_per_def[min(inst_count, len(insts_per_def) - 1)] += 1
                inst_count = 0
                def_count += 1
                expansion_of_last_acronym = acronym_expansion
                values_for_this_acronym[index].append(def_count)
        # NOTE(review): first row keeps def_count 0; each later distinct
        # expansion increments it, so def_count here is the last index used
        defs_per_acronym[min(def_count, len(defs_per_acronym) - 1)] += 1
        # store rows as a numpy array of [expansion, article_id, def_count]
        acronymDB[acronym] = numpy.array(values_for_this_acronym)
    dump(acronymDB)
    common_logger.info("Dumped AcronymDB successfully")
def createArticleIdToLdaDict(word_corpus, dictionary, lda_model):
    """Map every article to its LDA topic vector and pickle the mapping.

    Args:
        word_corpus: dict mapping article_id -> list of words.
        dictionary: gensim-style dictionary exposing doc2bow().
        lda_model: model supporting lda_model[bow] -> topic vector.
    Returns:
        dict mapping article_id -> lda_vector; also pickled (protocol 2)
        to file_lda_articleIDToLDA as a side effect.
    """
    common_logger.info("Creating article_id -> lda_vector dictionary")
    article_lda = {}
    # enumerate replaces the original hand-rolled counter; start=1 keeps
    # the "every 1000 articles" progress log firing at the same points
    for index, article_id in enumerate(word_corpus.keys(), start=1):
        bow = dictionary.doc2bow(word_corpus[article_id])
        article_lda[article_id] = lda_model[bow]
        if index % 1000 == 0:
            common_logger.debug("done with %d articles", index)
    common_logger.info("saving article_id -> lda_vector dictionary")
    # context manager closes the output file even if pickle.dump raises
    # (the original passed open(...) directly and leaked the handle)
    with open(file_lda_articleIDToLDA, "wb") as out_file:
        pickle.dump(article_lda, out_file, protocol=2)
    return article_lda
def downloadPdfs():
    """Download every arXiv PDF listed in the ScienceWise training index.

    Skips PDFs already present in folder_scienceWise_pdfs; failures are
    logged (with traceback) and the loop continues with the next entry.
    """
    with open(file_ScienceWise_index_train, "r") as file:
        reader = csv.DictReader(file, delimiter=",")
        for line in reader:
            pdfID = line["ARXIV_ID"]
            filename = _arxivIDToFilename(pdfID)
            try:
                if(os.path.exists(folder_scienceWise_pdfs + filename)):
                    common_logger.debug("present already " + pdfID)
                    continue
                _downloadPdf(pdfID)
                common_logger.debug("successfully downloaded " + pdfID)
                # throttle: wait 5 minutes between downloads
                time.sleep(5 * 60)
            except Exception:
                # narrowed from bare `except:` so KeyboardInterrupt and
                # SystemExit still propagate; real errors are still
                # logged with traceback and the loop keeps going
                common_logger.exception("Error in file " + pdfID)
def createFromScrapedArticles():
    """Build the articleDB from the scraped-article CSV files and dump it.

    Reads every CSV in file_scraped_articles_list (columns: article_id,
    article_text) into a dict and pickles it to file_articledb via dump().
    """
    common_logger.info("Creating ArticleDB")
    # allow arbitrarily large CSV fields (Python 2: sys.maxint)
    csv.field_size_limit(sys.maxint)
    articleDB = {}
    loaded_articles = 0
    for article_file in file_scraped_articles_list:
        # open as csv file with headers; the with-block closes the handle
        # (the original passed open(...) directly and leaked it)
        with open(article_file, "rb") as csv_file:
            article_csv = csv.DictReader(csv_file, delimiter=",")
            for row in article_csv:
                article_id = toUnicode(row["article_id"])
                articleDB[article_id] = toUnicode(row["article_text"])
                loaded_articles += 1
                if(loaded_articles % 10000 == 0):
                    common_logger.debug(
                        "loaded %d articles", loaded_articles)
    dump(articleDB, path=file_articledb)
    common_logger.info("Dumped ArticleDB successfully")
def load(path=file_articledb):
    """Load the pickled articleDB from disk.

    Args:
        path: pickle file to read; defaults to file_articledb.
    Returns:
        dictionary in the format (article_id: article_text)
    """
    common_logger.debug("loading articleDB from %s" % path)
    # Use a context manager so the file handle is closed even if
    # pickle.load raises (the original leaked the handle).
    with open(path, "rb") as db_file:
        return pickle.load(db_file)