def main():
    """Evaluate RRF-fused keyphrase extraction on the 500N-KPCrowd test split."""
    docs = Helper.getDataFromDir(
        'ake-datasets-master/datasets/500N-KPCrowd/test', mode='list')
    docs_as_str = listOfTaggedToString(docs)
    # test = dict(zip(list(test.keys()), list(test.values())))
    corpus_info = getInfo(docs)
    candidates = getAllCandidates(docs, deliver_as='sentences')
    # kfs = getPageRankOfDataset(test)
    rrfScores = {}
    for idx, doc_name in enumerate(docs.keys()):
        # cands = buildGramsUpToN(test[doc_name], 3)
        # g = buildGraph(cands, info['model'])
        # pr = computeWeightedPR(g, i, info, n_iter=15)
        flat_cands = list(itertools.chain.from_iterable(candidates[idx]))
        params = calculateParameters(docs_as_str[idx],
                                     corpus_info['score'][doc_name],
                                     flat_cands,
                                     pr=None)  # kfs[doc_name])  # , pr)
        print(params)
        # keep only the fused ranking order for evaluation
        rrfScores[doc_name] = list(getRecipRankFusionScore(params).keys())
    print('rrfScores', rrfScores)
    meanAPre, meanPre, meanRe, meanF1 = Helper.results(
        rrfScores, 'ake-datasets-master/datasets/500N-KPCrowd/references'
        '/test.reader.stem.json')
    doc_count = len(rrfScores.keys())
    print(f'Mean Avg Pre for {doc_count} documents: ', meanAPre)
    print(f'Mean Precision for {doc_count} documents: ', meanPre)
    print(f'Mean Recall for {doc_count} documents: ', meanRe)
    print(f'Mean F1 for {doc_count} documents: ', meanF1)
def getPageRankOfDataset(ds):
    """Compute keyphrase scores for every document in a dataset in parallel.

    Parameters
    ----------
    ds : str | dict
        Either a path to a dataset directory (loaded via
        ``Helper.getDataFromDir``) or an already-loaded mapping of
        file name -> tagged document.

    Returns
    -------
    dict
        Mapping of file name -> result of ``getKeyphrases`` for that document.
    """
    # isinstance is the idiomatic type check (was: type(ds) == str).
    if isinstance(ds, str):
        ds = Helper.getDataFromDir(ds, mode='list')
    with ProcessPoolExecutor(max_workers=cpu_count(
            logical=Helper.logical())) as executor:
        kfs = {}
        dsStr = listOfTaggedToString(ds)
        # Lower-cased plain-text collection shared by every worker.
        collection = [doc.lower() for doc in dsStr]
        cands = getAllCandidates(ds, deliver_as='sentences')
        info = getInfo(ds)
        # One job per document; the future maps back to its file name.
        fts = {
            executor.submit(getKeyphrases,
                            dsStr[i].lower(),
                            info,
                            candidates=cands[i],
                            doc_i=i,
                            collection=collection): file
            for i, file in enumerate(ds)
        }
        for future in as_completed(fts):
            kfs[fts[future]] = future.result()
    return kfs
def main():
    """Load the 500N-KPCrowd train split and dispatch to the right pipeline."""
    TEST = False
    train = Helper.getDataFromDir(
        'ake-datasets-master/datasets/500N-KPCrowd/train', mode='list')
    # test = dict(zip(list(test.keys())[:2], list(test.values())[:2]))
    # model = KeyedVectors.load_word2vec_format('wiki-news-300d-1M.vec')
    # model = None
    # tfidf = {'terms': terms, 'scoreArr': scoreArr, 'model': model}
    # if windows and not do logical (checks v >= 3.8.0)
    use_single = (system() == 'Windows' and not Helper.logical()) or TEST
    if use_single:  # True:#
        single_process(train)
    else:
        # if linux/mac or windows with v >= 3.8.0
        multi_process(train)
def multi_process(ds):
    """Score the dataset with the parallel PageRank pipeline and print metrics."""
    # cpu_count(logical=Helper.logical())
    page_ranks = getPageRankOfDataset(ds)
    # Keep only the candidate ordering (best first) for each document.
    kfs = {
        name: list(OrderedDict(Helper.dictToOrderedList(scores,
                                                        rev=True)).keys())
        for name, scores in page_ranks.items()
    }
    meanAPre, meanPre, meanRe, meanF1 = Helper.results(
        kfs, 'ake-datasets-master/datasets/500N-KPCrowd/references/train'
        '.reader.stem.json')
    doc_count = len(kfs.keys())
    print(f'Mean Avg Pre for {doc_count} documents: ', meanAPre)
    print(f'Mean Precision for {doc_count} documents: ', meanPre)
    print(f'Mean Recall for {doc_count} documents: ', meanRe)
    print(f'Mean F1 for {doc_count} documents: ', meanF1)
def main():
    """Fetch news per category, extract keyphrases, and emit CSV/plots/word clouds/HTML.

    Builds one 'Global' document spanning all categories first, then processes
    each category individually, appending an HTML section per category to
    page.html.
    """
    cats = [
        'Technology', 'Politics', 'Sports'
        # 'World', 'US', 'HomePage','Business', 'Economy', 'Soccer',
        # 'Science', 'Environment', 'Travel', 'Arts', 'Books'
    ]
    # 'w' truncates on open (replacing the old a+/truncate(0) combination) and
    # the with-block guarantees the file is closed even if a step raises.
    with open("page.html", "w") as text_file:
        global_doc = ''
        # build global doc
        for category in cats:
            documents = fetchCategory(category)
            global_doc += ' '.join(list(documents.values()))
        gCat = 'Global'
        results = calcKeywords({'0': global_doc}, gCat)
        kf = dict(zip(results.keys(), [d.keys() for d in results.values()]))
        createCSV(results, gCat)
        plotKeyphrases(gCat)
        generateWordCloud(results, gCat)
        text_file.write(generateHTML(kf, gCat))
        print(results)
        for category in cats:
            documents = fetchCategory(category)
            keywords_with_score = calcKeywords(documents, category)
            keywords = dict(
                zip(keywords_with_score.keys(),
                    [d.keys() for d in keywords_with_score.values()]))
            Helper.printDict(keywords_with_score)  # keyword / word -> score
            createCSV(keywords_with_score, category)
            plotKeyphrases(category)
            generateWordCloud(keywords_with_score, category)
            text_file.write(generateHTML(keywords, category))
            # relevantInCategory(category, keywords_with_score, results)
def getRecipRankFusionScore(words):
    """Fuse per-parameter ranks with reciprocal rank fusion.

    Each rank r of a candidate contributes 1 / (50 + r) to its fused score.
    Returns the top 50 candidates ordered by descending fused score.
    """
    fused = {
        name: sum(1 / (50 + rank) for rank in ranks)
        for name, ranks in words.items()
    }
    return dict(Helper.dictToOrderedList(fused, rev=True)[:50])
def calcKeywords(documents, category):
    """Persist *documents* under news/<category>, score them, and return ordered results."""
    createNewsFiles(documents, category)
    # keywords = run(f'news/{category}')
    raw_scores = getPageRankOfDataset(f'news/{category}')
    ordered = OrderedDict(Helper.dictToOrderedList(raw_scores['0'], rev=True))
    return {'0': ordered}
def priorCandLocation(cand: str, doc: str):
    """Positional prior for a candidate keyphrase inside a document.

    Occurrence positions are normalized by the stop-word-filtered document
    length. When the candidate occurs more than once, the spread between first
    and last occurrence (scaled by candidate length) is used; otherwise the
    (scaled) first-occurrence position. NOTE(review): str.find returns -1 for
    an absent candidate, so a missing candidate yields a negative prior —
    confirm that is intended upstream.
    """
    # normalized location, inverted because sentences in first keyphrases
    # are more relevant
    nsw_doc = Helper.filterStopWords(doc)
    # Guard: an empty (all-stop-word) document previously divided by zero.
    if not nsw_doc:
        return 0
    first_match = nsw_doc.find(cand) / len(nsw_doc)
    last_match = nsw_doc.rfind(cand) / len(nsw_doc)
    spread = last_match - first_match
    if spread != 0:
        return spread * len(cand)
    return first_match * len(cand)
def calculateTF_IDFAndNormalize(datasetFileName: str,
                                backgroundCollectionFileName: str):
    """Compute length-weighted, globally normalized TF-IDF scores for a dataset.

    The vectorizer is fitted on the background collection and then applied to
    the dataset. (The original code called ``fit_transform`` on the dataset,
    which refits the vectorizer and silently discards the background fit, so
    ``backgroundCollectionFileName`` had no effect.)

    Returns
    -------
    (terms, new_tf_idf)
        terms: vocabulary learned from the background collection.
        new_tf_idf: per-document scores, each term score boosted by its average
        characters-per-word and divided by the global maximum raw TF-IDF value.
    """
    ds = Helper.getDataFromDir(datasetFileName)
    extra = Helper.getDataFromDir(backgroundCollectionFileName)
    vec = TfidfVectorizer(stop_words=stop_words, ngram_range=(1, 3))
    # Fit IDF statistics on the background collection, then only transform
    # the dataset with them.
    vec.fit(extra.values())
    X = vec.transform(ds.values())
    terms = vec.get_feature_names()
    tf_idf = X.toarray()
    # Boost longer terms: multiply by average characters per word of the term.
    new_tf_idf = [[
        doc[j] * (len(terms[j]) / len(terms[j].split()))
        for j in range(len(terms))
    ] for doc in tf_idf]
    # Global maximum over the *raw* scores. NOTE(review): the length-weighted
    # values can exceed this maximum, so normalized scores are not guaranteed
    # to be <= 1 — confirm whether the max of new_tf_idf was intended instead.
    max_val = max((max(row) for row in tf_idf), default=1)
    # normalize each row (document) with the respective max value
    # new_tf_idf = [[doc[j] / max_val[i] for j in range(len(terms))] for i, doc in enumerate(new_tf_idf)]
    new_tf_idf = [[value / max_val for value in doc] for doc in new_tf_idf]
    return terms, new_tf_idf
def single_process(ds):
    """Sequentially score every document in *ds* and print evaluation metrics."""
    info = getInfo(ds)
    # lower case our collection as list
    collection = [doc.lower() for doc in ds.values()]
    kfs = {}
    for i, file in enumerate(ds):
        kfs[file] = getKeyphrases(ds[file].lower(),
                                  info,
                                  doc_i=i,
                                  collection=collection)
    meanAPre, meanPre, meanRe, meanF1 = Helper.results(
        kfs, 'ake-datasets-master/datasets/500N-KPCrowd/references'
        '/train.reader.stem.json')
    doc_count = len(kfs.keys())
    print(f'Mean Avg Pre for {doc_count} documents: ', meanAPre)
    print(f'Mean Precision for {doc_count} documents: ', meanPre)
    print(f'Mean Recall for {doc_count} documents: ', meanRe)
    print(f'Mean F1 for {doc_count} documents: ', meanF1)
def run(path):
    """Score documents under *path* with reciprocal rank fusion.

    Returns a mapping of document name -> {keyphrase: fused score}.
    """
    docs = Helper.getDataFromDir(path, mode='list')
    docs_as_str = listOfTaggedToString(docs)
    print(docs.keys())
    # return
    # test = dict(zip(list(test.keys()), list(test.values())))
    info = getInfo(docs)
    cands = getAllCandidates(docs, deliver_as='sentences')
    rrfScores = {}
    for i, doc_name in enumerate(docs.keys()):
        # cands = buildGramsUpToN(test[doc_name], 3)
        # g = buildGraph(cands, info['model'])
        # pr = computeWeightedPR(g, i, info, n_iter=15)
        flat_cands = list(itertools.chain.from_iterable(cands[i]))
        params = calculateParameters(docs_as_str[i], info['score'][doc_name],
                                     flat_cands)  # , pr)
        print(params)
        rrfScores[doc_name] = getRecipRankFusionScore(params)
    return rrfScores
allCandidatesTest) for doc_index, doc_name in enumerate(train.keys()): allParams = calculateParameters(allCandidatesTrain[doc_index], trainStr[doc_index], bm25train[doc_name]) if not targets[doc_name].count(0) == len(targets[doc_name]): p_classifier.fit(allParams, targets[doc_name]) print('predict') precision = [] recall = [] f1 = [] ap = [] tr = Helper.getTrueKeyphrases( 'ake-datasets-master/datasets/500N-KPCrowd/references/test.reader.stem.json' ) kfs = {} for doc_index, doc_name in enumerate(test.keys()): params = calculateParameters(allCandidatesTest[doc_index], testStr[doc_index], bm25test[doc_name]) predicted = p_classifier.predict(params) plane = p_classifier.decision_function(params) true = testTargets[doc_name] print('PERCEPTRON') print(predicted) print('[P2]', plane) print('REALITY')
def computeWeightedPR(g: networkx.Graph,
                      doc_i: int,
                      info: dict,
                      n_iter=1,
                      d=0.15):
    """Iteratively compute a prior-weighted PageRank over the candidate graph.

    Parameters
    ----------
    g : networkx.Graph
        Candidate graph whose edges carry a 'weight' attribute.
    doc_i : int
        Index of the document, forwarded to ``prior``.
    info : dict
        Auxiliary data consumed by ``prior``.
    n_iter : int
        Maximum number of power-iteration steps; stops earlier on convergence.
    d : float
        Teleport/damping factor multiplying the prior term.

    Returns
    -------
    dict
        Mapping of graph node -> final PageRank score.
    """
    # Uniform initial distribution over all nodes.
    pr = dict(zip(g.nodes, [1 / len(g.nodes) for _ in range(len(g.nodes))]))
    N = len(g.nodes)
    rate = 0.00001  # convergence tolerance passed to Helper.checkConvergence
    # cand := pi, calculate PR(pi) for all nodes
    for _ in range(n_iter):
        pr_pi = {}
        for pi in g.nodes():
            # Neighbors of pi (endpoint of each incident edge).
            pi_links = list(map(lambda e: e[1], g.edges(pi)))
            # sum_pr_pj = sum([pr[e[1]] / len(g.edges(e[1])) for e in pi_links])
            # all candidates, or only the ones linked to the current cand?
            # ∑pjPrior(pj)
            rst_array = []
            # div = sum([prior(doc_i, w, info) for w in pi_links])
            # Prior mass summed over *all* nodes normalizes pi's own prior below.
            div = sum([prior(doc_i, c, info) for c in g.nodes])
            if div == 0:
                div = 1  # avoid division by zero when every prior is 0
            for pj in pi_links:
                pj_links = list(map(lambda e: e[1], g.edges(pj)))
                # Total outgoing edge weight of pj.
                bot = sum([g.edges[pj, pk]['weight'] for pk in pj_links])
                if bot == 0:
                    # print(f'cand {pi} has 0 div, links = {pi_links}, doc_i={doc_i}')
                    bot = 1  # isolated/zero-weight neighbor: avoid div-by-zero
                weight = g.edges[pj, pi]['weight']
                if weight != 0.0:
                    top = pr[pj] * weight
                    rst_array.append(top / bot)
            rst = sum(rst_array)
            # div = sum([priorCandLocation(w, doc) for w in pi_links])
            # Weighted PR update: teleport to the normalized prior with
            # probability d, follow weighted edges with probability 1 - d.
            pr_pi[pi] = d * (prior(doc_i, pi, info) / div) + (1 - d) * rst
            # pr_pi[pi] = d/N + (1 - d) * rst
            # pr_pi[pi] = d * (priorCandLocation(pi, doc) / div) + (1 - d) * rst
            # pr_pi[cand] = d / N + (1 - d) * sum_pr_pj
        # Compare the previous and the freshly computed distributions.
        isConverged = Helper.checkConvergence(pr.values(), pr_pi.values(), N,
                                              rate)
        # print(Helper.dictToOrderedList(pr_pi, rev=True))
        # print('Converged? ', isConverged)
        # print('sum of PR', sum_pr)
        pr = pr_pi
        if isConverged:
            break
    print(f'{doc_i} finished - {Helper.dictToOrderedList(pr, rev=True)}')
    sum_pr = sum(pr.values())
    print(f'{doc_i} sum of PR = ', sum_pr)
    # with open('pr.csv', 'w', newline='') as f:
    #     writer = csv.writer(f)
    #     for k, v in pr.items():
    #         writer.writerow([k, v])
    return pr