Example #1
def make_concepts_exp(id, path, sents, query):
    """
    """
    
    query_words = set(util.porter_stem_sent(util.remove_stopwords(util.tokenize(fix_text(query)))).split())

    ## get sentence values
    sent_vals = prob_util.Counter()
    for sent in sents:
        query_overlap = set(util.remove_stopwords(sent.tok2.split())).intersection(query_words)
        sent_vals[sent] = max(0, len(query_overlap))
        #if sent.relevance < 0.3: sent_vals[sent] = 0.0
        #else: sent_vals[sent] = 100000**sent.relevance
        concepts = set(util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True))
        sent.concepts = set()
        for concept in concepts:
            if util.is_just_stopwords(concept.split('_')): continue
            sent.concepts.add(concept)
    sent_vals = prob_util.normalize(sent_vals)

    ## get concept values
    concept_vals = prob_util.Counter()
    for sent in sents:
        for concept in sent.concepts:
            concept_vals[concept] += sent_vals[sent]            
    concept_vals = prob_util.normalize(concept_vals)
    
    iter = 0
    while True:
        iter += 1
        se = prob_util.entropy(sent_vals)
        ce = prob_util.entropy(concept_vals)
        print >>sys.stderr, 'iter [%d] sent entropy [%1.4f] concept entropy [%1.4f]' %(iter, se, ce)
        if iter >= 1: break    ## single pass: the reweighting below never runs
        
        ## get sent vals again
        sent_vals = prob_util.Counter()
        for sent in sents:
            for concept in sent.concepts:
                sent_vals[sent] += concept_vals[concept]
        sent_vals = prob_util.normalize(sent_vals)
        
        ## get concept values
        concept_vals = prob_util.Counter()
        for sent in sents:
            for concept in sent.concepts:
                concept_vals[concept] += sent_vals[sent]            
        concept_vals = prob_util.normalize(concept_vals)
    
    sorted_sents = sent_vals.sortedKeys()
    #for sent in sorted_sents:
    #    print sent_vals[sent], sent.order, sent.new_par, sent

    sorted_concepts = concept_vals.sortedKeys()
    #for concept in sorted_concepts:
    #    print concept_vals[concept], concept
        
    ## create final concept set
    final_concepts = {}
    for concept in sorted_concepts:
        val = concept_vals[concept]
        #if val < 0.00001: continue
        final_concepts[concept] = val
    final_concept_set = set(final_concepts.keys())

    ## get final sentence list and their concepts
    seen_sents = set()
    for sent in sents:
        skip = False
        if sent.length <= 5: skip = True
        if sent in seen_sents: skip = True
        if sent.order > 0: skip = True
        else: seen_sents.add(sent)
        if skip: sent.concepts = set()
        else: sent.concepts = sent.concepts.intersection(final_concept_set)        
        
    return create_ilp_output(sents, final_concepts, path+id)
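The loop above alternates between sentence weights and concept weights, renormalizing each on every pass (here it is cut off after a single pass by the `if iter >= 1: break`). As a rough, self-contained illustration of that mutual-reinforcement pattern, independent of the project-local `util` and `prob_util` modules, here is a minimal sketch; `normalize` and the toy data are stand-ins, not part of the original code.

## Illustrative sketch only: sentence <-> concept reweighting with toy data.
def normalize(weights):
    total = sum(weights.values())
    return dict((k, v / total) for k, v in weights.items()) if total else weights

sent_concepts = {
    's1': set(['heavy_rain', 'flood_warning']),
    's2': set(['flood_warning', 'river_level']),
    's3': set(['river_level']),
}
sent_vals = normalize({'s1': 2.0, 's2': 1.0, 's3': 1.0})   ## e.g. query-overlap counts

for _ in range(10):
    ## concept value = sum of the values of the sentences that contain it
    concept_vals = {}
    for s, concepts in sent_concepts.items():
        for c in concepts:
            concept_vals[c] = concept_vals.get(c, 0.0) + sent_vals[s]
    concept_vals = normalize(concept_vals)

    ## sentence value = sum of the values of the concepts it contains
    sent_vals = normalize(dict((s, sum(concept_vals[c] for c in cs))
                               for s, cs in sent_concepts.items()))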
Example #2
def query_expand(docs, unit_selector, query):
    ## get sentence set
    sents = []
    for doc in docs:
        #if doc.doctype != 'NEWS STORY': continue
        for sent in doc.sentences:
            ## skip short sentences
            #if sent.length <= 5: continue
            sents.append(sent)
            
    ## initialize sentences with query similarity
    sent_values = prob_util.Counter()
    for sent in sents:
        try: sent_values[sent.original] = sent.sim_basic(query)
        except: sent_values[sent.original] = 1    ## fall back to a uniform weight if similarity fails
    sent_values = sent_values.makeProbDist()

    ## get units in each sent
    sent_units = {}
    for sent in sents:
        sent_units[sent.original] = prob_util.Counter()
        units = unit_selector(sent.stemmed)
        for unit in units:
            if text.text_processor.is_just_stopwords(unit): continue
            sent_units[sent.original][unit] += 1

    ## repeat until convergence
    previous_entropy_sent = 0
    previous_entropy_unit = 0
    for iter in range(1, 51):
        prev_sent_values = sent_values.copy()

        ## get new unit values from sent values
        unit_values = prob_util.Counter()
        for sent in sent_units:
            for unit in sent_units[sent]:
                unit_values[unit] += sent_values[sent]
        unit_values = unit_values.makeProbDist()

        ## get sent values from unit values
        sent_values = prob_util.Counter()
        for sent in sent_units:
            for unit in sent_units[sent]:
                sent_values[sent] += unit_values[unit] #/ len(sent_units[sent])
        sent_values = sent_values.makeProbDist()

        #prob_util.Counter(unit_values).displaySorted(N=5)
        #prob_util.Counter(sent_values).displaySorted(N=20)

        ## check for convergence
        entropy_sent = prob_util.entropy(sent_values)
        entropy_unit = prob_util.entropy(unit_values)
        ## entropies are non-negative and the previous values start at 0, so this
        ## test is satisfied on the first pass and the loop effectively runs once
        if entropy_sent >= previous_entropy_sent or entropy_unit >= previous_entropy_unit:
            break
        previous_entropy_sent = entropy_sent
        previous_entropy_unit = entropy_unit
        dist = prob_util.klDistance(prev_sent_values, sent_values)
        sys.stderr.write('%d sent entropy [%1.4f]  unit entropy [%1.4f]  sent dist [%1.6f]\n' %(iter, entropy_sent, entropy_unit, dist))
        #if iter == 2: break
        if dist < 0.0001:
            sys.stderr.write('----------------------------')
            break
        
    return prob_util.Counter(unit_values), prob_util.Counter(sent_values)
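`prob_util` is a project-local module whose `entropy` and `klDistance` helpers are not shown in these examples. Assuming they follow the standard definitions over a normalized distribution (the log base and the zero-handling below are assumptions), illustrative stand-ins might look like this:

import math

def entropy(dist):
    ## Shannon entropy of a dict mapping items to probabilities
    return -sum(p * math.log(p, 2) for p in dist.values() if p > 0)

def kl_distance(p, q, eps=1e-12):
    ## KL divergence D(p || q); eps guards against zero probabilities in q
    return sum(pv * math.log(pv / max(q.get(k, 0.0), eps), 2)
               for k, pv in p.items() if pv > 0)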
Example #3
def map_iterative_sents(docs, unit_selector, query):

    ## get sentence set
    sents = []
    for doc in docs:
        for sent in doc.sentences:
            ## skip short sentences
            #if sent.length <= 5: continue

            ## skip sentences with no query overlap
            if query: sim = sent.sim_basic(query)
            else: sim = 1
            if sim <= 0: continue

            sents.append(sent)
            
    ## initialize uniform sentence priors
    sent_values = prob_util.Counter()
    for sent in sents:
        sent_values[sent.original] = 1
    sent_values = sent_values.makeProbDist()

    ## get units in each sent
    sent_units = {}
    for sent in sents:
        sent_units[sent.original] = prob_util.Counter()
        units = unit_selector(sent.stemmed)
        for unit in units:
            if text.text_processor.is_just_stopwords(unit): continue
            sent_units[sent.original][unit] += 1

    ## repeat until convergence
    for iter in range(1, 51):
        prev_sent_values = sent_values.copy()
        
        ## get unit values from sent values
        unit_values = prob_util.Counter()
        for sent in sent_units:
            for unit in sent_units[sent]:
                unit_values[unit] += sent_values[sent]
        unit_values = unit_values.makeProbDist()

        ## get sent values from unit values
        sent_values = prob_util.Counter()
        for sent in sent_units:
            for unit in sent_units[sent]:
                sent_values[sent] += unit_values[unit] #/ len(sent_units[sent])
        sent_values = sent_values.makeProbDist()

        #prob_util.Counter(unit_values).displaySorted(N=5)
        #prob_util.Counter(sent_values).displaySorted(N=3)

        ## check for convergence
        entropy_sent = prob_util.entropy(sent_values)
        entropy_unit = prob_util.entropy(unit_values)
        dist = prob_util.klDistance(prev_sent_values, sent_values)
        #print '%d sent entropy [%1.4f]  unit entropy [%1.4f]  sent dist [%1.6f]' %(iter, entropy_sent, entropy_unit, dist)
        if iter == 2: break    ## stop after two passes regardless of convergence
        if dist < 0.0001:
            #print '----------------------------'
            break

    return prob_util.Counter(unit_values), prob_util.Counter(sent_values)
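The `unit_selector` argument is a callable that maps a stemmed sentence to the units (typically word n-grams) that get reweighted against the sentences; its actual implementation is not shown here. A hypothetical bigram selector compatible with the loop above, together with a toy stopword check standing in for `text.text_processor.is_just_stopwords`, could look like the following (names and word list are assumptions, not the project's own code):

STOPWORDS = set(['the', 'a', 'an', 'of', 'in', 'to', 'and'])   ## toy list

def is_just_stopwords(words):
    ## True if every token in the unit is a stopword
    return all(w in STOPWORDS for w in words)

def bigram_selector(stemmed_sentence):
    ## adjacent word pairs of an already-stemmed sentence; tuples are hashable,
    ## so they can serve directly as Counter keys in the loops above
    tokens = stemmed_sentence.split()
    return [(tokens[i], tokens[i + 1]) for i in range(len(tokens) - 1)]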
Example #4
def make_concepts_exp(id, path, sents, query):
    """
    """

    query_words = set(
        util.porter_stem_sent(
            util.remove_stopwords(util.tokenize(fix_text(query)))).split())

    ## get sentence values
    sent_vals = prob_util.Counter()
    for sent in sents:
        query_overlap = set(util.remove_stopwords(
            sent.tok2.split())).intersection(query_words)
        sent_vals[sent] = max(0, len(query_overlap))
        #if sent.relevance < 0.3: sent_vals[sent] = 0.0
        #else: sent_vals[sent] = 100000**sent.relevance
        concepts = set(
            util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True))
        sent.concepts = set()
        for concept in concepts:
            if util.is_just_stopwords(concept.split('_')): continue
            sent.concepts.add(concept)
    sent_vals = prob_util.normalize(sent_vals)

    ## get concept values
    concept_vals = prob_util.Counter()
    for sent in sents:
        for concept in sent.concepts:
            concept_vals[concept] += sent_vals[sent]
    concept_vals = prob_util.normalize(concept_vals)

    iter = 0
    while True:
        iter += 1
        se = prob_util.entropy(sent_vals)
        ce = prob_util.entropy(concept_vals)
        print >> sys.stderr, 'iter [%d] sent entropy [%1.4f] concept entropy [%1.4f]' % (
            iter, se, ce)
        if iter >= 1: break    ## single pass: the reweighting below never runs

        ## get sent vals again
        sent_vals = prob_util.Counter()
        for sent in sents:
            for concept in sent.concepts:
                sent_vals[sent] += concept_vals[concept]
        sent_vals = prob_util.normalize(sent_vals)

        ## get concept values
        concept_vals = prob_util.Counter()
        for sent in sents:
            for concept in sent.concepts:
                concept_vals[concept] += sent_vals[sent]
        concept_vals = prob_util.normalize(concept_vals)

    sorted_sents = sent_vals.sortedKeys()
    #for sent in sorted_sents:
    #    print sent_vals[sent], sent.order, sent.new_par, sent

    sorted_concepts = concept_vals.sortedKeys()
    #for concept in sorted_concepts:
    #    print concept_vals[concept], concept

    ## create final concept set
    final_concepts = {}
    for concept in sorted_concepts:
        val = concept_vals[concept]
        #if val < 0.00001: continue
        final_concepts[concept] = val
    final_concept_set = set(final_concepts.keys())

    ## get final sentence list and their concepts
    seen_sents = set()
    for sent in sents:
        skip = False
        if sent.length <= 5: skip = True
        if sent in seen_sents: skip = True
        if sent.order > 0: skip = True
        else: seen_sents.add(sent)
        if skip: sent.concepts = set()
        else: sent.concepts = sent.concepts.intersection(final_concept_set)

    return create_ilp_output(sents, final_concepts, path + id)
def query_expand(docs, unit_selector, query):
    ## get sentence set
    sents = []
    for doc in docs:
        #if doc.doctype != 'NEWS STORY': continue
        for sent in doc.sentences:
            ## skip short sentences
            #if sent.length <= 5: continue
            sents.append(sent)
            
    ## initialize sentences with query similarity
    sent_values = prob_util.Counter()
    for sent in sents:
        try: sent_values[sent.original] = sent.sim_basic(query)
        except: sent_values[sent.original] = 1    ## fall back to a uniform weight if similarity fails
    sent_values = sent_values.makeProbDist()
    original_sent_values = sent_values.copy()

    ## get units in each sent
    sent_units = {}
    for sent in sents:
        sent_units[sent.original] = prob_util.Counter()
        units = unit_selector(sent.stemmed)
        for unit in units:
            if text.text_processor.is_just_stopwords(unit): continue
            sent_units[sent.original][unit] += 1

    ## repeat until convergence
    prev_unit_entropy = 0
    prev_sent_entropy = 0
    prev_unit_values = {}
    prev_sent_values = {}
    for iter in range(1, 51):
        prev_sent_values = sent_values.copy()

        ## get new unit values from sent values
        unit_values = prob_util.Counter()
        for sent in sent_units:
            for unit in sent_units[sent]:
                unit_values[unit] += sent_values[sent]
        unit_values = unit_values.makeProbDist()

        ## get sent values from unit values
        sent_values = prob_util.Counter()
        for sent in sent_units:
            for unit in sent_units[sent]:
                sent_values[sent] += unit_values[unit] #/ len(sent_units[sent])
        sent_values = sent_values.makeProbDist()

        ## interpolate with original sent weights
        sent_prior = 0.1
        for sent in sent_values:
            new_value = (sent_prior * original_sent_values[sent]) + ((1 - sent_prior) * sent_values[sent])
            #sent_values[sent] = new_value    ## interpolation computed but currently disabled

        #prob_util.Counter(unit_values).displaySorted(N=100)
        #prob_util.Counter(sent_values).displaySorted(N=20)

        ## check for convergence
        entropy_sent = prob_util.entropy(sent_values)
        entropy_unit = prob_util.entropy(unit_values)
        dist = prob_util.klDistance(prev_sent_values, sent_values)
        sys.stderr.write('%d sent entropy [%1.4f]  unit entropy [%1.4f]  sent dist [%1.6f]\n' %(iter, entropy_sent, entropy_unit, dist))
        
        if iter == 2: break    ## stop after two passes regardless of convergence
        
        if (entropy_unit >= prev_unit_entropy) and (entropy_sent >= prev_sent_entropy): 
            unit_values = prev_unit_values
            sent_values = prev_sent_values
            break
        
        prev_unit_entropy = entropy_unit
        prev_sent_entropy = entropy_sent
        prev_unit_values = unit_values
        prev_sent_values = sent_values
        
        if dist < 0.0001: break

    #prob_util.Counter(unit_values).displaySorted(N=10)
    #prob_util.Counter(sent_values).displaySorted(N=20)

    return prob_util.Counter(unit_values), prob_util.Counter(sent_values)
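In this variant of `query_expand`, the interpolation with the original query-similarity weights is computed but never assigned back (the assignment stays commented out). If it were enabled, the update would be a convex combination of the iterated weights with the prior, roughly along these lines (illustrative stand-in, not part of the module):

def interpolate_with_prior(current, prior, prior_weight=0.1):
    ## mix the iterated sentence weights with the original query-similarity prior
    mixed = {}
    for sent, value in current.items():
        mixed[sent] = prior_weight * prior.get(sent, 0.0) + (1.0 - prior_weight) * value
    total = sum(mixed.values())
    return dict((k, v / total) for k, v in mixed.items()) if total else mixed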
def map_iterative_sents(docs, unit_selector, query):

    ## get sentence set
    sents = []
    for doc in docs:
        for sent in doc.sentences:
            ## skip short sentences
            #if sent.length <= 5: continue

            ## skip sentences with no query overlap
            if query: sim = sent.sim_basic(query)
            else: sim = 1
            if sim <= 0: continue

            sents.append(sent)
            
    ## initialize uniform sentence priors
    sent_values = prob_util.Counter()
    for sent in sents:
        sent_values[sent.original] = 1
    sent_values = sent_values.makeProbDist()

    ## get units in each sent
    sent_units = {}
    for sent in sents:
        sent_units[sent.original] = prob_util.Counter()
        units = unit_selector(sent.stemmed)
        for unit in units:
            if text.text_processor.is_just_stopwords(unit): continue
            sent_units[sent.original][unit] += 1

    ## repeat until convergence
    for iter in range(1, 51):
        prev_sent_values = sent_values.copy()
        
        ## get unit values from sent values
        unit_values = prob_util.Counter()
        for sent in sent_units:
            for unit in sent_units[sent]:
                unit_values[unit] += sent_values[sent]
        unit_values = unit_values.makeProbDist()

        ## get sent values from unit values
        sent_values = prob_util.Counter()
        for sent in sent_units:
            for unit in sent_units[sent]:
                sent_values[sent] += unit_values[unit] #/ len(sent_units[sent])
        sent_values = sent_values.makeProbDist()

        #prob_util.Counter(unit_values).displaySorted(N=5)
        #prob_util.Counter(sent_values).displaySorted(N=3)

        ## check for convergence
        entropy_sent = prob_util.entropy(sent_values)
        entropy_unit = prob_util.entropy(unit_values)
        dist = prob_util.klDistance(prev_sent_values, sent_values)
        #print '%d sent entropy [%1.4f]  unit entropy [%1.4f]  sent dist [%1.6f]' %(iter, entropy_sent, entropy_unit, dist)
        if iter == 2: break    ## stop after two passes regardless of convergence
        if dist < 0.0001:
            #print '----------------------------'
            break

    return prob_util.Counter(unit_values), prob_util.Counter(sent_values)