Example #1
def main():
    test = Helper.getDataFromDir(
        'ake-datasets-master/datasets/500N-KPCrowd/test', mode='list')
    testStr = listOfTaggedToString(test)
    # test = dict(zip(list(test.keys()), list(test.values())))
    info = getInfo(test)
    cands = getAllCandidates(test, deliver_as='sentences')
    #kfs = getPageRankOfDataset(test)
    rrfScores = {}

    for i, doc_name in enumerate(test.keys()):
        # cands =  buildGramsUpToN(test[doc_name], 3)
        # g = buildGraph(cands, info['model'])
        # pr = computeWeightedPR(g, i, info, n_iter=15)
        params = calculateParameters(
            testStr[i],
            info['score'][doc_name],
            list(itertools.chain.from_iterable(cands[i])),
            pr=None)  #kfs[doc_name])  # , pr)
        print(params)
        rrfScores[doc_name] = list(getRecipRankFusionScore(params).keys())

    print('rrfScores', rrfScores)
    meanAPre, meanPre, meanRe, meanF1 = Helper.results(
        rrfScores, 'ake-datasets-master/datasets/500N-KPCrowd/references'
        '/test.reader.stem.json')
    print(f'Mean Avg Pre for {len(rrfScores.keys())} documents: ', meanAPre)
    print(f'Mean Precision for {len(rrfScores.keys())} documents: ', meanPre)
    print(f'Mean Recall for {len(rrfScores.keys())} documents: ', meanRe)
    print(f'Mean F1 for {len(rrfScores.keys())} documents: ', meanF1)
Example #2
def getPageRankOfDataset(ds):
    if isinstance(ds, str):
        ds = Helper.getDataFromDir(ds, mode='list')
    with ProcessPoolExecutor(max_workers=cpu_count(
            logical=Helper.logical())) as executor:
        fts = {}
        kfs = {}
        dsStr = listOfTaggedToString(ds)
        collection = list(map(lambda doc: doc.lower(), dsStr))
        cands = getAllCandidates(ds, deliver_as='sentences')
        info = getInfo(ds)
        for i, file in enumerate(ds):
            fts.update({
                executor.submit(getKeyphrases,
                                dsStr[i].lower(),
                                info,
                                candidates=cands[i],
                                doc_i=i,
                                collection=collection):
                file
            })
        for future in as_completed(fts):
            file = fts[future]
            kfs.update({file: future.result()})
    return kfs
Example #3
def main():
    TEST = False
    test = Helper.getDataFromDir(
        'ake-datasets-master/datasets/500N-KPCrowd/train', mode='list')
    # test = dict(zip(list(test.keys())[:2], list(test.values())[:2]))

    # model = KeyedVectors.load_word2vec_format('wiki-news-300d-1M.vec')
    # model = None
    # tfidf = {'terms': terms, 'scoreArr': scoreArr, 'model': model}
    # on Windows when logical-core counting is unavailable (Helper.logical() checks v >= 3.8.0)
    if (system() == 'Windows' and not Helper.logical()) or TEST:
        single_process(test)
    # if linux/mac or windows with v >= 3.8.0
    else:
        multi_process(test)
Example #4
def multi_process(ds):
    # cpu_count(logical=Helper.logical())
    pr = getPageRankOfDataset(ds)
    kfs = dict(
        zip(pr.keys(), [
            list(OrderedDict(Helper.dictToOrderedList(p, rev=True)).keys())
            for p in pr.values()
        ]))
    meanAPre, meanPre, meanRe, meanF1 = Helper.results(
        kfs, 'ake-datasets-master/datasets/500N-KPCrowd/references/train'
        '.reader.stem.json')
    print(f'Mean Avg Pre for {len(kfs.keys())} documents: ', meanAPre)
    print(f'Mean Precision for {len(kfs.keys())} documents: ', meanPre)
    print(f'Mean Recall for {len(kfs.keys())} documents: ', meanRe)
    print(f'Mean F1 for {len(kfs.keys())} documents: ', meanF1)
Example #5
def main():
    cats = [
        'Technology', 'Politics', 'Sports'
        # 'World', 'US', 'HomePage','Business', 'Economy', 'Soccer', 'Science', 'Environment', 'Travel', 'Arts', 'Books'
    ]
    text_file = open("page.html", "a+")
    text_file.truncate(0)

    global_doc = ''
    # build global doc
    for category in cats:
        documents = fetchCategory(category)
        global_doc += ' '.join(list(documents.values()))

    gCat = 'Global'
    results = calcKeywords({'0': global_doc}, gCat)
    kf = dict(zip(results.keys(), [d.keys() for d in results.values()]))
    createCSV(results, gCat)
    plotKeyphrases(gCat)
    generateWordCloud(results, gCat)
    text_file.write(generateHTML(kf, gCat))

    print(results)
    for category in cats:
        documents = fetchCategory(category)
        keywords_with_score = calcKeywords(documents, category)
        keywords = dict(
            zip(keywords_with_score.keys(),
                [d.keys() for d in keywords_with_score.values()]))
        Helper.printDict(keywords_with_score)

        # keywords_with_score: doc id -> {keyword: score}
        # keywords: doc id -> keyword list

        createCSV(keywords_with_score, category)

        plotKeyphrases(category)

        generateWordCloud(keywords_with_score, category)

        text_file.write(generateHTML(keywords, category))

        #relevantInCategory(category, keywords_with_score, results)

    text_file.close()
Example #6
def getRecipRankFusionScore(words):
    RRFScore = {}
    for name, scores in words.items():
        wordScore = []
        for param in scores:
            wordScore.append(1 / (50 + param))  # reciprocal rank fusion term, k = 50
        RRFScore.update({name: sum(wordScore)})

    # keep the 50 highest-scoring candidates
    return dict(Helper.dictToOrderedList(RRFScore, rev=True)[:50])
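For reference, the function above applies Reciprocal Rank Fusion: each candidate's score is the sum of 1 / (k + r) over the ranks r it obtains under the individual parameters, with k = 50, and only the top 50 candidates are returned. A minimal, self-contained sketch of the same idea on made-up rankings (the candidate words and the two rank lists below are purely illustrative):

def rrf_fuse(rank_lists, k=50):
    # sum 1 / (k + rank) for every ranking in which the candidate appears
    scores = {}
    for ranking in rank_lists:
        for rank, cand in enumerate(ranking):
            scores[cand] = scores.get(cand, 0.0) + 1 / (k + rank)
    return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)

# two hypothetical rankings of the same candidates (e.g. TF-IDF vs. position)
print(rrf_fuse([['climate summit', 'policy', 'energy'],
                ['energy', 'climate summit', 'policy']]))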
Example #7
def calcKeywords(documents, category):
    createNewsFiles(documents, category)

    # keywords = run(f'news/{category}')

    keywords = getPageRankOfDataset(f'news/{category}')
    keywords = {
        '0': OrderedDict(Helper.dictToOrderedList(keywords['0'], rev=True))
    }
    return keywords
Example #8
def priorCandLocation(cand: str, doc: str):
    # normalized location, inverted because keyphrases in the first sentences
    # are more relevant
    nsw_doc = Helper.filterStopWords(doc)
    first_match = nsw_doc.find(cand) / len(nsw_doc)
    last_match = nsw_doc.rfind(cand) / len(nsw_doc)
    spread = last_match - first_match
    if spread != 0:
        return spread * len(cand)
    else:
        return first_match * len(cand)
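As a rough check of the prior above: a candidate that occurs only once is scored by its normalized first position times its length, while a recurring candidate is scored by the normalized spread between its first and last occurrence. A toy version with a stand-in stop-word filter (the filter and the sample sentence are assumptions, not the project's Helper.filterStopWords):

def filter_stop_words(text, stops=('a', 'an', 'the', 'and', 'of')):
    # crude stand-in for Helper.filterStopWords
    return ' '.join(w for w in text.split() if w.lower() not in stops)

def prior_cand_location(cand, doc):
    nsw_doc = filter_stop_words(doc)
    first = nsw_doc.find(cand) / len(nsw_doc)
    last = nsw_doc.rfind(cand) / len(nsw_doc)
    spread = last - first
    return (spread if spread != 0 else first) * len(cand)

doc = 'the mayor opened the new stadium and praised the stadium design'
print(prior_cand_location('stadium', doc))  # recurring: scored by its spread
print(prior_cand_location('design', doc))   # single occurrence: scored by its position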
Example #9
def calculateTF_IDFAndNormalize(datasetFileName: str,
                                backgroundCollectionFileName: str):
    ds = Helper.getDataFromDir(datasetFileName)
    extra = Helper.getDataFromDir(backgroundCollectionFileName)

    vec = TfidfVectorizer(stop_words=stop_words, ngram_range=(1, 3))
    vec.fit(extra.values())
    # note: fit_transform re-fits the vectorizer on the dataset, so the
    # background-collection fit above is discarded; transform() would keep it
    X = vec.fit_transform(ds.values())
    terms = vec.get_feature_names()
    tf_idf = X.toarray()

    # weight each score by the term's average characters per token,
    # favoring longer, more specific n-grams
    new_tf_idf = [[
        doc[j] * (len(terms[j]) / len(terms[j].split()))
        for j in range(len(terms))
    ] for doc in tf_idf]
    # global maximum over the unweighted matrix
    max_val = [max(tf_idf[i]) for i in range(len(tf_idf))]
    max_val = max(max_val)
    # alternative: normalize each row (document) by its own max value
    # new_tf_idf = [[doc[j] / max_val[i] for j in range(len(terms))] for i, doc in enumerate(new_tf_idf)]
    new_tf_idf = [[doc[j] / max_val for j in range(len(terms))]
                  for i, doc in enumerate(new_tf_idf)]
    return terms, new_tf_idf
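To make the post-processing above concrete: each TF-IDF score is multiplied by its term's average characters per token (boosting longer, more specific n-grams), and the result is divided by the single global maximum of the unweighted matrix, as in the function above. A short sketch on a made-up 2x3 matrix (the terms and values are illustrative):

terms = ['press', 'press conference', 'press conference room']
tf_idf = [[0.40, 0.30, 0.20],
          [0.10, 0.50, 0.00]]

# weight each score by average characters per token of its term
weighted = [[row[j] * (len(terms[j]) / len(terms[j].split()))
             for j in range(len(terms))] for row in tf_idf]

# divide everything by the global maximum of the unweighted matrix
max_val = max(max(row) for row in tf_idf)
normalized = [[v / max_val for v in row] for row in weighted]
print(normalized)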
Example #10
def single_process(ds):
    kfs = {}
    info = getInfo(ds)
    # lower case our collection as list
    collection = list(map(lambda doc: doc.lower(), ds.values()))
    for i, file in enumerate(ds):
        d_pr = getKeyphrases(ds[file].lower(),
                             info,
                             doc_i=i,
                             collection=collection)
        kfs.update({file: d_pr})

    meanAPre, meanPre, meanRe, meanF1 = Helper.results(
        kfs, 'ake-datasets-master/datasets/500N-KPCrowd/references'
        '/train.reader.stem.json')
    print(f'Mean Avg Pre for {len(kfs.keys())} documents: ', meanAPre)
    print(f'Mean Precision for {len(kfs.keys())} documents: ', meanPre)
    print(f'Mean Recall for {len(kfs.keys())} documents: ', meanRe)
    print(f'Mean F1 for {len(kfs.keys())} documents: ', meanF1)
Example #11
def run(path):
    test = Helper.getDataFromDir(path, mode='list')
    testStr = listOfTaggedToString(test)
    print(test.keys())
    # return
    # test = dict(zip(list(test.keys()), list(test.values())))
    info = getInfo(test)
    cands = getAllCandidates(test, deliver_as='sentences')
    rrfScores = {}

    for i, doc_name in enumerate(test.keys()):
        # cands =  buildGramsUpToN(test[doc_name], 3)
        # g = buildGraph(cands, info['model'])
        # pr = computeWeightedPR(g, i, info, n_iter=15)
        params = calculateParameters(
            testStr[i], info['score'][doc_name],
            list(itertools.chain.from_iterable(cands[i])))  # , pr)
        print(params)
        rrfScores[doc_name] = getRecipRankFusionScore(params)

    return rrfScores
Example #12
    allCandidatesTest)

for doc_index, doc_name in enumerate(train.keys()):
    allParams = calculateParameters(allCandidatesTrain[doc_index],
                                    trainStr[doc_index], bm25train[doc_name])
    # only fit when the document has at least one positive target
    if targets[doc_name].count(0) != len(targets[doc_name]):
        p_classifier.fit(allParams, targets[doc_name])

print('predict')

precision = []
recall = []
f1 = []
ap = []
tr = Helper.getTrueKeyphrases(
    'ake-datasets-master/datasets/500N-KPCrowd/references/test.reader.stem.json'
)
kfs = {}

for doc_index, doc_name in enumerate(test.keys()):
    params = calculateParameters(allCandidatesTest[doc_index],
                                 testStr[doc_index], bm25test[doc_name])

    predicted = p_classifier.predict(params)
    plane = p_classifier.decision_function(params)
    true = testTargets[doc_name]

    print('PERCEPTRON')
    print(predicted)
    print('[P2]', plane)
    print('REALITY')
Example #13
def computeWeightedPR(g: networkx.Graph,
                      doc_i: int,
                      info: dict,
                      n_iter=1,
                      d=0.15):
    # start from a uniform score over all candidate nodes
    pr = dict(zip(g.nodes, [1 / len(g.nodes) for _ in range(len(g.nodes))]))
    N = len(g.nodes)
    rate = 0.00001  # convergence threshold

    # cand := pi, calculate PR(pi) for all nodes
    for _ in range(n_iter):
        pr_pi = {}
        for pi in g.nodes():
            pi_links = list(map(lambda e: e[1], g.edges(pi)))

            # sum_pr_pj = sum([pr[e[1]] / len(g.edges(e[1])) for e in pi_links])
            # all candidates, or only the ones currently linked to this candidate?
            # Σ_pj Prior(pj)
            rst_array = []

            # div = sum([prior(doc_i, w, info) for w in pi_links])
            div = sum([prior(doc_i, c, info) for c in g.nodes])
            if div == 0:
                div = 1

            for pj in pi_links:
                pj_links = list(map(lambda e: e[1], g.edges(pj)))
                bot = sum([g.edges[pj, pk]['weight'] for pk in pj_links])
                if bot == 0:
                    # print(f'cand {pi} has 0 div, links = {pi_links}, doc_i={doc_i}')
                    bot = 1
                weight = g.edges[pj, pi]['weight']
                if weight != 0.0:
                    top = pr[pj] * weight
                    rst_array.append(top / bot)
            rst = sum(rst_array)

            # div = sum([priorCandLocation(w, doc) for w in pi_links])

            pr_pi[pi] = d * (prior(doc_i, pi, info) / div) + (1 - d) * rst
            # pr_pi[pi] = d/N + (1 - d) * rst
            # pr_pi[pi] = d * (priorCandLocation(pi, doc) / div) + (1 - d) * rst
            # pr_pi[cand] = d / N + (1 - d) * sum_pr_pj

        isConverged = Helper.checkConvergence(pr.values(), pr_pi.values(), N,
                                              rate)

        # print(Helper.dictToOrderedList(pr_pi, rev=True))
        # print('Converged? ', isConverged)
        # print('sum of PR', sum_pr)

        pr = pr_pi

        if isConverged:
            break

    print(f'{doc_i} finished - {Helper.dictToOrderedList(pr, rev=True)}')
    sum_pr = sum(pr.values())
    print(f'{doc_i} sum of PR = ', sum_pr)
    # with open('pr.csv', 'w', newline='') as f:
    #     writer = csv.writer(f)
    #     for k, v in pr.items():
    #         writer.writerow([k, v])
    return pr
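For reference, each iteration above applies a prior-biased weighted PageRank update: PR(pi) = d * prior(pi) / Σ_pj prior(pj) + (1 - d) * Σ_{pj ∈ neighbors(pi)} w(pj, pi) * PR(pj) / Σ_{pk ∈ neighbors(pj)} w(pj, pk), with d = 0.15. A self-contained toy run of the same update on a tiny weighted graph with a uniform prior (the graph, its weights and the prior are illustrative stand-ins for buildGraph and prior(doc_i, ., info)):

import networkx

# toy candidate graph with made-up similarity weights
g = networkx.Graph()
g.add_edge('solar energy', 'energy policy', weight=0.9)
g.add_edge('energy policy', 'carbon tax', weight=0.4)
g.add_edge('solar energy', 'carbon tax', weight=0.2)

d = 0.15
prior = {n: 1.0 for n in g.nodes}            # uniform stand-in for prior(doc_i, ., info)
pr = {n: 1 / len(g.nodes) for n in g.nodes}  # uniform initialization

for _ in range(15):
    new_pr = {}
    for pi in g.nodes:
        rst = 0.0
        for pj in g.neighbors(pi):
            out = sum(g.edges[pj, pk]['weight'] for pk in g.neighbors(pj)) or 1
            rst += pr[pj] * g.edges[pj, pi]['weight'] / out
        new_pr[pi] = d * prior[pi] / sum(prior.values()) + (1 - d) * rst
    pr = new_pr

print(sorted(pr.items(), key=lambda kv: kv[1], reverse=True))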