Example #1
def serialGetWordCorpus(articleDB):
    """Convert each article's text into a list of cleaned word tokens."""
    word_corpus = {}
    for article_id, text in articleDB.items():
        # stem_words and removeNumbers are module-level flags (not shown here)
        word_corpus[article_id] = TextTools.getCleanedWords(
            text, stem_words=stem_words, removeNumbers=removeNumbers)
        if len(word_corpus) % 1000 == 0:
            common_logger.debug("converted %d articles to words", len(word_corpus))
    return word_corpus
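A minimal usage sketch, assuming articleDB maps article IDs to raw text and that stem_words and removeNumbers are module-level flags (none of these are defined in the snippet; the sample article is invented):

# Hypothetical input; TextTools and the module-level flags must be in scope.
articleDB = {"a1": "The CPU (central processing unit) executes instructions."}
word_corpus = serialGetWordCorpus(articleDB)
# word_corpus["a1"] now holds the cleaned token list for article "a1"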
Example #2
def load(path=file_acronymdb):
    """
    acronymDB is a dictionary in the format:
    (acronym: [list of [acronym_expansion, article_id]])
    """
    common_logger.debug("loading acronymDB from %s" % path)
    with open(path, "rb") as db_file:
        return pickle.load(db_file)
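A lookup sketch based on the documented format (the key "CPU" is illustrative, not from the source):

acronymDB = load()
for row in acronymDB.get("CPU", []):
    expansion, article_id = row[0], row[1]  # rows may carry a trailing def_count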
Example #3
def createFromScrapedDefinitions():
    """Build acronymDB from the scraped definition CSVs and pickle it to disk."""
    common_logger.info("Creating AcronymDB")
    csv.field_size_limit(sys.maxint)  # Python 2; use sys.maxsize on Python 3

    acronymDB = {}
    loaded_acronyms = 0
    for definition_file in file_scraped_definitions_list:
        # open each definition file as a csv with headers
        acronym_csv = csv.DictReader(open(definition_file, "rb"), delimiter=",")

        for row in acronym_csv:
            acronym = toUnicode(row["acronym"])
            acronym_expansion = toUnicode(row["acronym_expansion"])
            article_id = toUnicode(row["article_id"])
            if acronym not in acronymDB:
                acronymDB[acronym] = []
            normalized_expansion = acronym_expansion.strip().lower().replace('-', ' ')
            acronymDB[acronym].append([normalized_expansion, article_id])
            # row["article_title"] was also stored in the old format
            loaded_acronyms += 1
            if loaded_acronyms % 10000 == 0:
                common_logger.debug("loaded %d acronyms", loaded_acronyms)

    common_logger.info("adding def_count values to acronymDB")
    defs_per_acronym = [0] * 1000
    insts_per_def = [0] * 1000
    #num_acronyms = len(acronymDB)
    for acronym, values_for_this_acronym in acronymDB.items():
        values_for_this_acronym = sorted(
            values_for_this_acronym, key=lambda x: x[0])

        def_count = 0
        inst_count = 0
        expansion_of_last_acronym = values_for_this_acronym[0][0]
        # group rows whose expansions start the same way under one def_count,
        # normalizing each row's expansion to the group's representative
        # (rows also carried article_title in the old format)
        for index, [acronym_expansion, article_id] \
                in enumerate(values_for_this_acronym):
            if AcronymExpansion.startsSameWay(acronym_expansion, expansion_of_last_acronym):
                inst_count += 1
                values_for_this_acronym[index].append(def_count)
                values_for_this_acronym[index][0] = expansion_of_last_acronym
            else:
                insts_per_def[min(inst_count, len(insts_per_def) - 1)] += 1
                inst_count = 0
                def_count += 1
                expansion_of_last_acronym = acronym_expansion
                values_for_this_acronym[index].append(def_count)
        defs_per_acronym[min(def_count, len(defs_per_acronym) - 1)] += 1
        acronymDB[acronym] = numpy.array(values_for_this_acronym)

    dump(acronymDB)
    common_logger.info("Dumped AcronymDB successfully")
Example #4
def createArticleIdToLdaDict(word_corpus, dictionary, lda_model):
    common_logger.info("Creating article_id -> lda_vector dictionary")
    article_lda = {}
    index = 0
    for article_id, words in word_corpus.items():
        bow = dictionary.doc2bow(words)
        article_lda[article_id] = lda_model[bow]
        index += 1
        if index % 1000 == 0:
            common_logger.debug("done with %d articles", index)
    common_logger.info("saving article_id -> lda_vector dictionary")
    with open(file_lda_articleIDToLDA, "wb") as out_file:
        pickle.dump(article_lda, out_file, protocol=2)
    return article_lda
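dictionary.doc2bow and lda_model[bow] match gensim's Dictionary and LdaModel interfaces, so a usage sketch under that assumption (num_topics=100 is an arbitrary choice, not from the source):

from gensim.corpora import Dictionary
from gensim.models import LdaModel

# word_corpus as produced by serialGetWordCorpus in Example #1
dictionary = Dictionary(word_corpus.values())
bow_corpus = [dictionary.doc2bow(words) for words in word_corpus.values()]
lda_model = LdaModel(bow_corpus, id2word=dictionary, num_topics=100)
article_lda = createArticleIdToLdaDict(word_corpus, dictionary, lda_model)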
Example #5
def downloadPdfs():
    with open(file_ScienceWise_index_train, "r") as index_file:
        reader = csv.DictReader(index_file, delimiter=",")
        for line in reader:
            pdfID = line["ARXIV_ID"]
            filename = _arxivIDToFilename(pdfID)
            try:
                if os.path.exists(os.path.join(folder_scienceWise_pdfs, filename)):
                    common_logger.debug("present already " + pdfID)
                    continue
                _downloadPdf(pdfID)
                common_logger.debug("successfully downloaded " + pdfID)
                # wait 5 minutes between downloads to be polite to the arXiv servers
                time.sleep(5 * 60)
            except Exception:
                common_logger.exception("Error in file " + pdfID)
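The helper _arxivIDToFilename is not shown. A hypothetical sketch of what it might do, since old-style arXiv IDs such as "hep-th/9901001" contain a "/" that cannot appear in a filename (the real implementation may differ):

# Hypothetical; the project's actual private helper is not in this listing.
def _arxivIDToFilename(pdfID):
    return pdfID.replace("/", "_") + ".pdf"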
Example #6
def createFromScrapedArticles():
    """Build articleDB from the scraped article CSVs and pickle it to disk."""
    common_logger.info("Creating ArticleDB")
    csv.field_size_limit(sys.maxint)  # Python 2; use sys.maxsize on Python 3

    articleDB = {}
    loaded_articles = 0
    for article_file in file_scraped_articles_list:
        # open as csv file with headers
        article_csv = csv.DictReader(open(article_file, "rb"), delimiter=",")

        for row in article_csv:
            article_id = toUnicode(row["article_id"])
            articleDB[article_id] = toUnicode(row["article_text"])
            loaded_articles += 1
            if loaded_articles % 10000 == 0:
                common_logger.debug("loaded %d articles", loaded_articles)

    dump(articleDB, path=file_articledb)
    common_logger.info("Dumped ArticleDB successfully")
Example #7
def load(path=file_articledb):
    """
    Returns: dictionary in the format (article_id: article_text)
    """
    common_logger.debug("loading articleDB from %s" % path)
    with open(path, "rb") as db_file:
        return pickle.load(db_file)
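A round-trip sketch combining this loader with Example #1 (assumes the pickle written by Example #6 exists on disk):

articleDB = load()                            # article_id -> article_text
word_corpus = serialGetWordCorpus(articleDB)  # article_id -> cleaned tokens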