Code Example #1
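Both examples rely on imports and a module-level setting that are not shown in this listing. The block below is a best-guess reconstruction based on the names used further down; `functions` is the project's own helper module, and the value of `clusterSize` is an assumption.

# Assumed imports and settings (not part of the original listing; inferred
# from the names used in the two functions below)
import json
import os
import re
import sys
import random

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
from wordcloud import WordCloud

import functions  # the project's own helper module (dicOfRelevantFiles, etc.)

clusterSize = 10  # hypothetical value: pages are grouped into clusters of this many pages
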
def tfidfPublications(pathToMemex, PageOrPubl):
    print("\tProcessing: %s" % PageOrPubl)
    # PART 1: loading OCR files into a corpus
    ocrFiles = functions.dicOfRelevantFiles(pathToMemex, ".json")
    citeKeys = list(ocrFiles.keys())  # append [:500] to test on a smaller subset

    print("\taggregating texts into documents...")
    corpusDic = {}
    for citeKey in citeKeys:
        docData = json.load(open(ocrFiles[citeKey], "r", encoding="utf8"))
        for page, text in docData.items():
            # text as a document
            if PageOrPubl == "publications":
                if citeKey not in corpusDic:
                    corpusDic[citeKey] = []
                corpusDic[citeKey].append(text)

            # page cluster as a document
            elif PageOrPubl == "pages":
                pageNum = int(page)
                citeKeyNew = "%s_%05d" % (citeKey, roundUp(
                    pageNum, clusterSize))
                if citeKeyNew not in corpusDic:
                    corpusDic[citeKeyNew] = []
                corpusDic[citeKeyNew].append(text)

                # add the last page of cluster N to cluster N+1
                if pageNum % clusterSize == 0:
                    citeKeyNew = "%s_%05d" % (
                        citeKey, roundUp(pageNum + 1, clusterSize))
                    if citeKeyNew not in corpusDic:
                        corpusDic[citeKeyNew] = []
                    corpusDic[citeKeyNew].append(text)
            else:
                sys.exit(
                    "`PageOrPubl` parameter must be `publications` or `pages`")

    print("\t%d documents (%s) generated..." % (len(corpusDic), PageOrPubl))
    print("\tpreprocessing the corpus...")

    # PART 2: preprocessing the corpus into cleaned documents
    docList = []
    docIdList = []

    for docId, docText in corpusDic.items():
        # drop short clusters (two pages or fewer), i.e. leftover last pages
        if len(docText) > 2:
            doc = " ".join(docText)
            # clean the document text
            doc = re.sub(r'(\w)-\n(\w)', r'\1\2', doc)  # rejoin words hyphenated across line breaks
            doc = re.sub(r'\W+', ' ', doc)  # replace non-word characters with spaces
            doc = re.sub(r'_+', ' ', doc)  # drop underscores (not covered by \W)
            doc = re.sub(r'\d+', ' ', doc)  # drop digits
            doc = re.sub(r' +', ' ', doc)  # collapse repeated spaces
            # we can also drop documents with very few words
            # (for example, pages dominated by illustrations);
            # here we drop clusters with fewer than 1,000 words
            # (a 6-page cluster averages roughly 2,500-3,000 words)
            if len(doc.split(" ")) > 1000:
                # update lists
                docList.append(doc)
                docIdList.append(docId)

    # PART 3: calculate tfidf for all loaded publications and distances
    print("\tgenerating tfidf matrix & distances...")
    stopWords = functions.loadMultiLingualStopWords(
        ["eng", "deu", "fre", "spa"])
    vectorizer = CountVectorizer(ngram_range=(1, 1),
                                 min_df=5,
                                 max_df=0.5,
                                 stop_words=stopWords)
    countVectorized = vectorizer.fit_transform(docList)
    tfidfTransformer = TfidfTransformer(smooth_idf=True, use_idf=True)
    vectorized = tfidfTransformer.fit_transform(
        countVectorized)  # generates a sparse matrix
    cosineMatrix = cosine_similarity(vectorized)

    # PART 4: saving TFIDF --- only for publications!
    if PageOrPubl == "publications":
        print("\tsaving tfidf data...")
        tfidfTable = pd.DataFrame(vectorized.toarray(),
                                  index=docIdList,
                                  columns=vectorizer.get_feature_names_out())  # get_feature_names() was removed in scikit-learn 1.2
        tfidfTable = tfidfTable.transpose()
        print("\ttfidfTable Shape: ", tfidfTable.shape)
        tfidfTableDic = tfidfTable.to_dict()

        tfidfTableDicFilt = filterTfidfDictionary(tfidfTableDic, 0.05, "more")
        pathToSave = os.path.join(pathToMemex,
                                  "results_tfidf_%s.dataJson" % PageOrPubl)
        with open(pathToSave, 'w', encoding='utf8') as f9:
            json.dump(tfidfTableDicFilt,
                      f9,
                      sort_keys=True,
                      indent=4,
                      ensure_ascii=False)

    # PART 5: saving cosine distances --- for both publications and page clusters
    print("\tsaving cosine distances data...")
    cosineTable = pd.DataFrame(cosineMatrix)
    print("\tcosineTable Shape: ", cosineTable.shape)
    cosineTable.columns = docIdList
    cosineTable.index = docIdList
    cosineTableDic = cosineTable.to_dict()

    cosineTableDicFilt = filterTfidfDictionary(cosineTableDic, 0.25, "more")
    pathToSave = os.path.join(pathToMemex,
                              "results_cosineDist_%s.dataJson" % PageOrPubl)
    with open(pathToSave, 'w', encoding='utf8') as f9:
        json.dump(cosineTableDicFilt,
                  f9,
                  sort_keys=True,
                  indent=4,
                  ensure_ascii=False)
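
The function above also calls two project helpers that are defined elsewhere: `roundUp`, which rounds a page number up to the nearest multiple of `clusterSize`, and `filterTfidfDictionary`, which filters a nested dictionary by a threshold. The sketches below are reconstructions inferred from how these helpers are called here, not the project's actual implementations.

# Hypothetical reconstructions of the helpers used above (inferred from usage)
import math

def roundUp(value, base):
    # round value up to the nearest multiple of base, e.g. roundUp(7, 5) == 10
    return int(math.ceil(value / float(base))) * base

def filterTfidfDictionary(dictionary, threshold, lessOrMore):
    # keep only inner values above ("more") or below ("less") the threshold,
    # skipping self-matches; drop outer keys that end up empty
    dictionaryFilt = {}
    for item1, innerDic in dictionary.items():
        filtered = {}
        for item2, value in innerDic.items():
            if item1 == item2:
                continue
            if lessOrMore == "more" and value >= threshold:
                filtered[item2] = value
            elif lessOrMore == "less" and value <= threshold:
                filtered[item2] = value
        if filtered:
            dictionaryFilt[item1] = filtered
    return dictionaryFilt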
Code Example #2
def generateTfIdfWordClouds(pathToMemex):
    # PART 1: loading OCR files into a corpus
    ocrFiles = functions.dicOfRelevantFiles(pathToMemex, ".json")
    citeKeys = list(ocrFiles.keys())  # append [:500] to test on a smaller subset

    print("\taggregating texts into documents...")
    docList = []
    docIdList = []

    for citeKey in citeKeys:
        docData = json.load(open(ocrFiles[citeKey], "r", encoding="utf8"))

        docId = citeKey
        doc = " ".join(docData.values())

        # clean the document text
        doc = re.sub(r'(\w)-\n(\w)', r'\1\2', doc)  # rejoin words hyphenated across line breaks
        doc = re.sub(r'\W+', ' ', doc)  # replace non-word characters with spaces
        doc = re.sub(r'_+', ' ', doc)  # drop underscores (not covered by \W)
        doc = re.sub(r'\d+', ' ', doc)  # drop digits
        doc = re.sub(r' +', ' ', doc)  # collapse repeated spaces

        # update lists
        docList.append(doc)
        docIdList.append(docId)

    print("\t%d documents generated..." % len(docList))

    # PART 2: calculate tfidf for all loaded publications and distances
    print("\tgenerating tfidf matrix & distances...")
    stopWords = functions.loadMultiLingualStopWords(["deu", "eng", "fre"])
    vectorizer = CountVectorizer(ngram_range=(1, 1),
                                 min_df=2,
                                 max_df=0.5,
                                 stop_words=stopWords)
    countVectorized = vectorizer.fit_transform(docList)
    tfidfTransformer = TfidfTransformer(smooth_idf=True, use_idf=True)
    vectorized = tfidfTransformer.fit_transform(
        countVectorized)  # generates a sparse matrix

    print("\tconverting and filtering tfidf data...")
    tfidfTable = pd.DataFrame(vectorized.toarray(),
                              index=docIdList,
                              columns=vectorizer.get_feature_names_out())  # get_feature_names() was removed in scikit-learn 1.2
    tfidfTable = tfidfTable.transpose()
    tfidfTableDic = tfidfTable.to_dict()
    tfidfTableDic = filterTfidfDictionary(tfidfTableDic, 0.03, "more")

    # PART 4: generating wordclouds
    print("\tgenerating wordclouds...")
    wc = WordCloud(
        width=1000,
        height=600,
        background_color="white",
        random_state=2,
        relative_scaling=0.5,
        # color_func=lambda *args, **kwargs: (179, 0, 0),  # single color
        # colormap="copper",  # other warm maps: Oranges, Reds, YlOrBr, YlOrRd, OrRd
        colormap="autumn")  # grayscale alternatives: binary, gray
    # colormap reference: https://matplotlib.org/3.1.1/gallery/color/colormap_reference.html

    counter = len(tfidfTableDic)
    citeKeys = list(tfidfTableDic.keys())
    random.shuffle(citeKeys)

    for citeKey in citeKeys:
        savePath = functions.generatePublPath(pathToMemex, citeKey)
        savePath = os.path.join(savePath, "%s_wCloud.jpg" % citeKey)

        if not os.path.isfile(savePath):
            wc.generate_from_frequencies(tfidfTableDic[citeKey])
            # plotting
            plt.imshow(wc, interpolation="bilinear")
            plt.axis("off")
            #plt.show() # this line shows the plot
            plt.savefig(savePath, dpi=200, bbox_inches='tight')

            print("\t%s (%d left...)" % (citeKey, counter))
            counter -= 1

        else:
            print("\t%s --- already done" % (citeKey))
            counter -= 1
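
A minimal sketch of how these two functions might be driven from a script, assuming the memex folder sits next to the script; the actual driver code and path handling are not part of this listing.

# Hypothetical driver code (not part of the original listing)
if __name__ == "__main__":
    pathToMemex = "./memex_sandbox"  # assumed location of the memex folder

    # tf-idf and cosine-distance data for whole publications and for page clusters
    tfidfPublications(pathToMemex, "publications")
    tfidfPublications(pathToMemex, "pages")

    # one tf-idf-based word cloud per publication
    generateTfIdfWordClouds(pathToMemex)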