Example 1
def docSummaryEx1(document, ordered=False, language='portuguese'):
    # Accept either a raw document (string) or a pre-split sentence list.
    if isinstance(document, str):
        documentString = getString(document)
        document = getSentences(documentString, language)
    # Rank sentences with plain (unbiased) PageRank over the sentence graph.
    invIndex = invertedIndex(document)
    graphDict = createGraph(document, invIndex)
    pRankDict = pageRank(graphDict)
    topSentences = getTopSentences(pRankDict, document, ordered)
    return topSentences
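
A minimal usage sketch for context; the module name summarizer and the input file are hypothetical, and getString is assumed to accept a file path or raw text:

# Hypothetical usage; 'summarizer' and 'article.txt' are assumed names.
from summarizer import docSummaryEx1

topSentences = docSummaryEx1('article.txt', ordered=True, language='portuguese')
for sentence in topSentences:
    print(sentence)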
def getFeatures(documentString, documentSentences):
    invIndex = invertedIndex(documentSentences)
    graph = createGraph(documentSentences, invIndex)

    # One prior value per sentence, from four independent scoring functions.
    priorDictSim = nodePriorSimilarity(invIndex, documentSentences,
                                       documentString)
    priorDictDegree = nodePriorDegree(graph, documentSentences)
    priorDictPosition = nodePriorPosition(documentSentences, documentString)
    priorDictScores = nodePriorScores(invIndex, documentSentences,
                                      documentString)

    # Assemble a four-feature row per sentence, preserving sentence order.
    features = []
    for i in range(len(documentSentences)):
        features.append([priorDictSim[i], priorDictDegree[i],
                         priorDictPosition[i], priorDictScores[i]])
    return features
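
getFeatures returns one four-element row per sentence (similarity, degree, position, scores), in sentence order. A minimal sketch of how such a feature matrix combines with a weight vector into per-sentence scores, mirroring the loop in docSummaryEx3 below; the numbers are illustrative:

# Dot each feature row with a weight vector to get one score per sentence.
def scoreSentences(features, weights):
    return [sum(f * w for f, w in zip(row, weights)) for row in features]

# Two sentences, four features each, equal weights (made-up values).
features = [[0.8, 0.5, 1.0, 0.3], [0.4, 0.9, 0.5, 0.7]]
print(scoreSentences(features, [0.25, 0.25, 0.25, 0.25]))  # ~[0.65, 0.625]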
def docSummaryEx3(document, weightsList):
    if isinstance(document, str):
        documentString = getString(document)
        document = getSentences(documentString)
    else:
        # The prior functions below need the full text; rebuild it from the
        # sentence list (assumes a whitespace join is an adequate stand-in).
        documentString = ' '.join(document)
    invIndex = invertedIndex(document)
    graphDict = createGraph(document, invIndex)
    documentScore = {}

    # Calculate features and update sentence scores.
    priorDictSim = nodePriorSimilarity(invIndex, document, documentString)
    priorDictDegree = nodePriorDegree(graphDict, document)
    priorDictPosition = nodePriorPosition(document, documentString)
    priorDictScores = nodePriorScores(invIndex, document, documentString)

    # Score each sentence as the weighted sum of its four priors.
    for node in range(len(document)):
        documentScore[node] = (priorDictSim[node] * weightsList[0] +
                               priorDictDegree[node] * weightsList[1] +
                               priorDictPosition[node] * weightsList[2] +
                               priorDictScores[node] * weightsList[3])

    topSentences = getTopSentences(documentScore, document)
    return topSentences
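
A usage sketch for the weighted variant, under the same hypothetical module; the four weights follow the feature order above (similarity, degree, position, scores):

# Hypothetical call: emphasize sentence position over the other priors.
from summarizer import docSummaryEx3

topSentences = docSummaryEx3('article.txt', [0.2, 0.1, 0.5, 0.2])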
def docSummaryEx2(document):
    if isinstance(document, str):
        documentString = getString(document)
        document = getSentences(documentString)
    else:
        # As above, rebuild the plain-string view from the sentence list.
        documentString = ' '.join(document)
    invIndex = invertedIndex(document)
    graphDict = createGraph(document, invIndex)

    # Prior functions (uncomment an alternative to swap the node prior)
    priorDict = nodePriorPosition(document, documentString)
    #priorDict = nodePriorSimilarity(invIndex, document, documentString)
    #priorDict = nodePriorDegree(graphDict, document)
    #priorDict = nodePriorScores(invIndex, document, documentString)
    #priorDict = nodePriorNoun(document)

    # Weight functions (uncomment an alternative to swap the edge weights)
    weightDict = edgeWeightScores(graphDict, invIndex, document, documentString)
    #weightDict = edgeWeightSimilarity(invIndex, graphDict, document)
    #weightDict = edgeWeightNoun(graphDict, document)

    # Biased PageRank over the sentence graph with the chosen prior/weights.
    pRankDict = pageRankOpt(graphDict, priorDict, weightDict)
    topSentences = getTopSentences(pRankDict, document)
    return topSentences
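
pageRankOpt itself is not shown in this listing. As a rough sketch of what a prior-biased, edge-weighted PageRank of this shape typically computes, assuming graphDict maps each node to its neighbors, priorDict holds per-node teleport mass, and weightDict maps (node, neighbor) pairs to edge weights:

def pageRankOptSketch(graph, prior, weight, damping=0.85, iterations=50):
    # Normalize the prior so the teleport distribution sums to one.
    total = sum(prior.values()) or 1.0
    rank = {n: prior[n] / total for n in graph}
    for _ in range(iterations):
        newRank = {}
        for n in graph:
            # Mass flowing into n from each in-neighbor m, split by weight.
            incoming = 0.0
            for m in graph:
                if n in graph[m]:
                    outWeight = sum(weight[(m, k)] for k in graph[m]) or 1.0
                    incoming += rank[m] * weight[(m, n)] / outWeight
            newRank[n] = (1 - damping) * prior[n] / total + damping * incoming
        rank = newRank
    return rank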
Example 5

import copy
import timeit
# 'perr' is assumed to be a stderr-like stream in the original module.
from sys import stderr as perr

import utils
def smartCrawl(top_k, count, pool_thre, jaccard_thre, threads, budget, api,
               sampledata, localdata, hiddendata):
    """
    Given a budget ofb queries, SMARTCRAWL first constructs a query pool based on the local database and then
    iteratively issues b queries to the hidden database such that the union of the query results can cover
    the maximum number of records in the local database. Finally, it performs entity resolution between the
    local database and the crawled records.
    ----**DeepER: Deep Entity Resolution**

    :param top_k: top-k constraint of specific api
    :param count: size of hidden database
    :param pool_thre: threshold of queries' frequency
    :param jaccard_thre: jaccard threshold
    :param threads: numbers of queries issued at each iteration
    :param budget: the budget of api call times
    :param api: An implementation of simapi for specific api.
    :param sampledata: SampleData object
    :param localdata: LocalData object
    :param hiddendata: HiddenData object
    :return:
    """
    time_s = timeit.default_timer()
    sample = sampledata.getSample()
    D1_ids, D1_query, D1_er = localdata.getlocalData()

    # Fractions of the hidden database covered by the sample and local data.
    sample_rate = 1.0 * len(sample) / count
    Dratio = 1.0 * len(D1_ids) / count

    time_e = timeit.default_timer()
    print(time_e - time_s, 'data loaded.', file=perr)

    time_s = timeit.default_timer()
    initQueries = utils.queryGene(D1_query, pool_thre)
    time_e = timeit.default_timer()
    print(time_e - time_s, 'query pool finished.', file=perr)

    #####inverted index #####
    time_s = timeit.default_timer()
    D1index = utils.invertedIndex(initQueries, D1_query)
    initQueries, D1index = utils.add_naiveIndex(initQueries, D1_query, D1index)
    sampleindex = utils.invertedIndex(initQueries, sample)
    time_e = timeit.default_timer()
    print(time_e - time_s, 'index building finished.', file=perr)
    #####forward index #####
    time_s = timeit.default_timer()
    findex = utils.forwardIndex(D1index)
    time_e = timeit.default_timer()
    print(time_e - time_s, 'forward index finished.', file=perr)

    ##### biased #####
    D1_ids_deeper = copy.deepcopy(D1_ids)
    query_pool = utils.initScore_biased(sampleindex, top_k, sample_rate,
                                        Dratio, initQueries)
    # The main loop stops once 'budget' queries have been issued in total.
    flagNum = len(initQueries) - budget

    curcov = set()
    curmat = []
    updateList = utils.updateList(D1index)

    # Greedily issue queries until the budget is exhausted, the pool empties,
    # or every local record is covered. query_pool is used as a priority
    # dict: popitem() returns the (query, priority) pair with the best score,
    # and additem() reinserts a query under a new priority.
    while len(query_pool) > flagNum and len(query_pool) != 0 and \
            len(curcov) < len(D1_ids):
        queries = []
        while len(queries) < threads:
            if len(query_pool) == 0:
                break
            top = query_pool.popitem()
            if updateList[top[0]] != 0:
                # Lazy update: records covered since this query was scored
                # made its priority stale; recompute it and reinsert.
                if len(sampleindex[top[0]]) <= top_k * sample_rate:
                    if len(sampleindex[top[0]]) == 0 and \
                            len(D1index[top[0]]) > (top_k * Dratio):
                        new_priority = top[1] - (updateList[top[0]] * top_k *
                                                 Dratio / len(D1index[top[0]]))
                    else:
                        new_priority = top[1] - updateList[top[0]]
                else:
                    new_priority = top[1] - (updateList[top[0]] * top_k *
                                             sample_rate /
                                             len(sampleindex[top[0]]))
                query_pool.additem(top[0], new_priority)
                updateList[top[0]] = 0
            else:
                queries.append(list(top[0]))

        cur_raw_result = api.callMulAPI(queries)
        cur_er_result = hiddendata.proResult(cur_raw_result)

        matched_ids, matched_pair = utils.results_simjoin(
            cur_er_result, D1_er, jaccard_thre)
        removed_ids = D1_ids_deeper.intersection(matched_ids)
        # Each newly covered record makes every query that retrieves it less
        # valuable; bump those queries' lazy-update counters.
        for d in removed_ids:
            for q in findex[d]:
                updateList[q] += 1

        D1_ids_deeper.difference_update(matched_ids)
        curcov = curcov.union(matched_ids)
        curmat.extend(matched_pair)
        print(len(cur_raw_result), 'results returned,',
              len(matched_ids), 'local records covered at this iteration,',
              len(hiddendata.getMergeResult()), 'different results returned,',
              len(curcov), 'local records covered in total.')

    api.getSession().close()
    hiddendata.setMatchPair(curmat)
    hiddendata.saveMatchPair()
    hiddendata.saveResult()
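
A hedged invocation sketch; the constructors for the SampleData, LocalData, and HiddenData objects and the simapi implementation are assumptions for illustration, not signatures confirmed by this snippet:

# Hypothetical wiring; all constructor arguments are assumed.
api = MySimApi()                        # some implementation of simapi
sampledata = SampleData('sample.json')
localdata = LocalData('local.json')
hiddendata = HiddenData('output/')

smartCrawl(top_k=100, count=1000000, pool_thre=2, jaccard_thre=0.85,
           threads=10, budget=5000, api=api, sampledata=sampledata,
           localdata=localdata, hiddendata=hiddendata)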