def docSummaryEx1(document, ordered=False, language='portuguese'):
    """Summarize a document by ranking its sentences with plain PageRank.

    A string input is first converted to text and split into sentences
    (for the given language); a sentence graph is then built from the
    inverted index, ranked with PageRank, and the top sentences are
    returned -- optionally in their original document order.
    """
    if isinstance(document, str):
        raw_text = getString(document)
        document = getSentences(raw_text, language)
    index = invertedIndex(document)
    graph = createGraph(document, index)
    ranking = pageRank(graph)
    return getTopSentences(ranking, document, ordered)
def getFeatures(documentString, documentSentences):
    """Build the per-sentence feature matrix used for weight training.

    For every sentence the four node priors are computed -- similarity,
    graph degree, position, and scores -- and returned as a list of
    [sim, degree, position, scores] rows, one row per sentence.
    """
    index = invertedIndex(documentSentences)
    graph = createGraph(documentSentences, index)
    sim = nodePriorSimilarity(index, documentSentences, documentString)
    degree = nodePriorDegree(graph, documentSentences)
    position = nodePriorPosition(documentSentences, documentString)
    scores = nodePriorScores(index, documentSentences, documentString)
    # One feature row per sentence, in sentence order.
    return [[sim[i], degree[i], position[i], scores[i]]
            for i in range(len(documentSentences))]
def docSummaryEx3(document, weightsList):
    """Summarize by scoring each sentence as a weighted sum of four priors.

    :param document: raw text (str) or an already-split sentence list.
        NOTE(review): when a sentence list is passed, documentString is
        never assigned but is used by the prior functions below -- this
        would raise NameError; confirm callers always pass a str.
    :param weightsList: four weights applied, in order, to the
        [similarity, degree, position, scores] priors.
    :return: the top sentences as selected by getTopSentences.
    """
    if isinstance(document, str):
        documentString = getString(document)
        document = getSentences(documentString)
    invIndex = invertedIndex(document)
    graphDict = createGraph(document, invIndex)
    # Calculate the four per-sentence prior features.
    priorDictSim = nodePriorSimilarity(invIndex, document, documentString)
    # BUG FIX: nodePriorDegree takes the sentence graph (as in getFeatures
    # and docSummaryEx2), not the inverted index; graphDict was previously
    # built here but never used, which confirms the intended argument.
    priorDictDegree = nodePriorDegree(graphDict, document)
    priorDictPosition = nodePriorPosition(document, documentString)
    priorDictScores = nodePriorScores(invIndex, document, documentString)
    # Linear combination of the priors gives each sentence's score.
    documentScore = {}
    for node in range(len(document)):
        documentScore[node] = (priorDictSim[node] * weightsList[0]
                               + priorDictDegree[node] * weightsList[1]
                               + priorDictPosition[node] * weightsList[2]
                               + priorDictScores[node] * weightsList[3])
    return getTopSentences(documentScore, document)
def docSummaryEx2(document):
    """Summarize using a prior- and edge-weight-biased PageRank.

    The current configuration uses sentence position as the node prior
    and score-based edge weighting; the commented-out calls are the
    alternative prior/weight functions kept for experimentation.
    """
    if isinstance(document, str):
        documentString = getString(document)
        document = getSentences(documentString)
    index = invertedIndex(document)
    graph = createGraph(document, index)
    # Node prior: position of the sentence within the document.
    prior = nodePriorPosition(document, documentString)
    #prior = nodePriorSimilarity(index, document, documentString)
    #prior = nodePriorDegree(graph, document)
    #prior = nodePriorScores(index, document, documentString)
    #prior = nodePriorNoun(document)
    # Edge weights: score-based weighting between sentence pairs.
    weights = edgeWeightScores(graph, index, document, documentString)
    #weights = edgeWeightSimilarity(index, graph, document)
    #weights = edgeWeightNoun(graph, document)
    ranking = pageRankOpt(graph, prior, weights)
    return getTopSentences(ranking, document)
def smartCrawl(top_k, count, pool_thre, jaccard_thre, threads, budget, api,
               sampledata, localdata, hiddendata):
    """
    Given a budget of b queries, SMARTCRAWL first constructs a query pool
    based on the local database and then iteratively issues b queries to the
    hidden database such that the union of the query results can cover the
    maximum number of records in the local database.  Finally, it performs
    entity resolution between the local database and the crawled records.

    ----**DeepER: Deep Entity Resolution**

    :param top_k: top-k constraint of specific api
    :param count: size of hidden database
    :param pool_thre: threshold of queries' frequency
    :param jaccard_thre: jaccard threshold
    :param threads: numbers of queries issued at each iteration
    :param budget: the budget of api call times
    :param api: An implementation of simapi for specific api.
    :param sampledata: SampleData object
    :param localdata: LocalData object
    :param hiddendata: HiddenData object
    :return:
    """
    # --- load the sample and the local database --------------------------
    time_s = timeit.default_timer()
    sample = sampledata.getSample()
    D1_ids, D1_query, D1_er = localdata.getlocalData()
    # Fractions of the hidden database represented by the sample and by the
    # local data; 1.0 * forces float division (Python 2 semantics).
    sample_rate = 1.0 * len(sample) / count
    Dratio = 1.0 * len(D1_ids) / count
    time_e = timeit.default_timer()
    # NOTE(review): `perr` is presumably a stderr-like stream defined at
    # module level (not visible in this chunk); Python 2 `print >>` syntax.
    print >> perr, time_e - time_s, 'data loaded.'

    # --- build the initial query pool from the local records -------------
    time_s = timeit.default_timer()
    initQueries = utils.queryGene(D1_query, pool_thre)
    time_e = timeit.default_timer()
    print >> perr, time_e - time_s, 'query pool finished.'

    ##### inverted index #####
    # Map each candidate query to the local / sample records it matches.
    time_s = timeit.default_timer()
    D1index = utils.invertedIndex(initQueries, D1_query)
    initQueries, D1index = utils.add_naiveIndex(initQueries, D1_query,
                                               D1index)
    sampleindex = utils.invertedIndex(initQueries, sample)
    time_e = timeit.default_timer()
    print >> perr, time_e - time_s, 'index building finished.'

    ##### forward index #####
    # Reverse mapping: local record id -> queries that match it; used to
    # discount queries once a record has already been covered.
    time_s = timeit.default_timer()
    findex = utils.forwardIndex(D1index)
    time_e = timeit.default_timer()
    print >> perr, time_e - time_s, 'forward index'

    ##### biased #####
    # Working copy of the uncovered local ids; shrinks as records match.
    D1_ids_deeper = copy.deepcopy(D1_ids)
    # Priority pool of queries scored by estimated coverage benefit.
    # NOTE(review): popitem()/additem() suggest a priority-dict structure
    # returned by initScore_biased -- confirm its API.
    query_pool = utils.initScore_biased(sampleindex, top_k, sample_rate,
                                        Dratio, initQueries)
    # Stop once `budget` queries have been popped from the pool.
    flagNum = len(initQueries) - budget
    curcov = set()   # local ids covered so far
    curmat = []      # accumulated (local, hidden) matched pairs
    # Per-query counter of newly-covered records since its score was set;
    # a nonzero entry means the query's priority is stale.
    updateList = utils.updateList(D1index)

    # Main crawl loop: stop when the budget is spent, the pool is empty,
    # or every local record is covered.
    while len(query_pool) > flagNum and len(query_pool) != 0 and len(
            curcov) < len(D1_ids):
        queries = []
        # Draw up to `threads` queries whose priorities are up to date.
        while len(queries) < threads:
            if len(query_pool) > 0:
                top = query_pool.popitem()
                if updateList[top[0]] != 0:
                    # Priority is stale: subtract the benefit already lost
                    # to records covered by earlier queries, then re-insert.
                    if len(sampleindex[top[0]]) <= top_k * sample_rate:
                        if len(sampleindex[top[0]]) == 0 and len(
                                D1index[top[0]]) > (top_k * Dratio):
                            # Unsampled query: estimate from local data.
                            new_priority = top[1] - updateList[
                                top[0]] * top_k * Dratio / len(D1index[top[0]])
                        else:
                            # Query fits under the top-k cap: full discount.
                            new_priority = top[1] - updateList[top[0]]
                        # (estimate scaled by the sample-based hit rate)
                    else:
                        new_priority = top[1] - updateList[
                            top[0]] * top_k * sample_rate / len(
                            sampleindex[top[0]])
                    query_pool.additem(top[0], new_priority)
                    updateList[top[0]] = 0
                    continue
                else:
                    queries.append(list(top[0]))
            else:
                break

        # Issue the batch, resolve entities, and join against local records.
        cur_raw_result = api.callMulAPI(queries)
        cur_er_result = hiddendata.proResult(cur_raw_result)
        matched_ids, matched_pair = utils.results_simjoin(
            cur_er_result, D1_er, jaccard_thre)
        # Every query matching a newly-covered record loses future benefit:
        # bump its stale counter via the forward index.
        removed_ids = D1_ids_deeper.intersection(matched_ids)
        for d in removed_ids:
            for q in findex[d]:
                updateList[q] += 1
        D1_ids_deeper.difference_update(matched_ids)
        curcov = curcov.union(matched_ids)
        curmat.extend(matched_pair)
        print len(cur_raw_result), ' results returned, ', len(
            matched_ids), ' local records covered at this iteration. ', \
            len(hiddendata.getMergeResult()), 'different results returned, ', len(
            curcov), ' local records covered totally.'

    # --- persist the crawl results ---------------------------------------
    api.getSession().close()
    hiddendata.setMatchPair(curmat)
    hiddendata.saveMatchPair()
    hiddendata.saveResult()