Esempio n. 1
0
def main():
    """Build the sentence assignment and alignment libraries and pickle them.

    Loads ``ptr_data.p``, ``ptr_params.p`` and ``ptr_sentence_counter.p`` from
    the experiment directory, walks the sentences from most to least frequent
    and stores one assignment per sentence in ``assignment_library``.  Both
    libraries are checkpointed to disk every 1000 sentences and written once
    more after the loop, so the files on disk reflect the finished run.

    NOTE: unpickling can execute arbitrary code; only run this against
    pickle files you produced yourself.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    data_directory = '../../experiments/ptr/'
    assignment_path = os.path.join(data_directory, 'ptr_assignment_library.p')
    alignment_path = os.path.join(data_directory, 'ptr_alignment_library.p')

    # Load data
    logging.info('Loading ptr_data')
    with open(os.path.join(data_directory, 'ptr_data.p')) as f:
        ptr_data = cPickle.load(f)

    logging.info('Loading ptr_params')
    with open(os.path.join(data_directory, 'ptr_params.p')) as f:
        ptr_params = cPickle.load(f)

    pfst_params = PTRExperiment.PFSTParams(ptr_data, ptr_params)

    logging.info('Loading sentence counter')
    with open(os.path.join(data_directory, 'ptr_sentence_counter.p')) as f:
        sentence_counter = cPickle.load(f)

    # Most frequent sentences first; the counter itself is not needed afterwards.
    sorted_sentence_counts = sorted(sentence_counter.items(), key=lambda x: x[1], reverse=True)
    del sentence_counter

    # alignment_library is never modified here directly; it is handed to
    # metrics.get_assignment, which presumably fills it in -- TODO confirm.
    alignment_library = {}
    assignment_library = {}

    def save_libraries():
        # The open mode is an argument of open(), not a component of the path.
        with open(assignment_path, 'w') as out:
            cPickle.dump(assignment_library, out)
        with open(alignment_path, 'w') as out:
            cPickle.dump(alignment_library, out)

    # Inference
    logging.info('Starting inference')
    for sentence_ct, (wordids, count) in enumerate(sorted_sentence_counts, 1):
        if sentence_ct % 1000 == 0:
            logging.info('%s sentences done, current count: %s', sentence_ct, count)
            # Periodic checkpoint so a crash does not lose the whole run.
            save_libraries()

        if wordids in assignment_library:
            continue

        known_ideas = ptr_params.ideas.wordids_idx_dict
        if wordids in known_ideas:
            # Sentence is already registered as an idea: reuse its stored value.
            assignment_library[wordids] = known_ideas[wordids]
        else:
            top_ideas = metrics.get_top_candidates(wordids, ptr_params, num_top_candidates=20, jaccard_cutoff=0.5)
            assignment_library[wordids] = metrics.get_assignment(
                wordids, top_ideas, ptr_data, ptr_params, pfst_params, alignment_library)

    # Final save of both libraries (covers sentences after the last checkpoint).
    logging.info('Writing assignment and alignment libraries')
    save_libraries()
Esempio n. 2
0
def main():
    """Re-assign every document partition and pickle the updated parameters.

    Loads ``ptr_data.p`` and ``ptr_params.p`` plus the previously saved
    assignment and alignment libraries, deep-copies the parameters, and for
    each partition ``p`` of each document stores a new value in
    ``new_ptr_params.docid_partitions[docid][p]``: the entry from
    ``ideas.wordids_idx_dict`` when the sentence is present there, otherwise
    the result of ``metrics.get_assignment``.  The updated copy is written to
    ``ptr_params.greedy.001.p``.

    NOTE: unpickling can execute arbitrary code; only run this against
    pickle files you produced yourself.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    data_directory = '../../experiments/ptr/'

    # Load data
    logging.info('Loading ptr_data')
    with open(os.path.join(data_directory, 'ptr_data.p')) as f:
        ptr_data = cPickle.load(f)

    logging.info('Loading ptr_params')
    with open(os.path.join(data_directory, 'ptr_params.p')) as f:
        ptr_params = cPickle.load(f)

    pfst_params = PTRExperiment.PFSTParams(ptr_data, ptr_params)

    # Work on a copy so the loaded ptr_params object is left untouched by the
    # partition updates below.
    logging.info('Creating new_ptr_params')
    new_ptr_params = copy.deepcopy(ptr_params)

    # These must be *loaded*; the files are opened for reading and the names
    # do not exist yet, so dumping here could never have worked.
    logging.info('Loading assignment and alignment libraries')
    with open(os.path.join(data_directory, 'ptr_assignment_library.p')) as f:
        # NOTE(review): assignment_library is loaded but not used below --
        # confirm whether it was meant to serve as a lookup cache.
        assignment_library = cPickle.load(f)

    with open(os.path.join(data_directory, 'ptr_alignment_library.p')) as f:
        alignment_library = cPickle.load(f)

    # Inference
    logging.info('Starting inference')
    for doc_ct, (docid, wordids) in enumerate(ptr_data.docid_wordids.iteritems(), 1):
        if doc_ct % 1000 == 0:
            logging.info('%s documents done', doc_ct)

        new_partitions = new_ptr_params.docid_partitions[docid]
        # Iterate the original partitions while writing into the copy.
        for p, _previous in ptr_params.docid_partitions[docid].iteritems():
            # p[0]:p[1] is used as the slice of the document's word ids that
            # makes up this partition; a tuple so it can be a dict key.
            sent_wordids = tuple(wordids[p[0]:p[1]])

            known_ideas = new_ptr_params.ideas.wordids_idx_dict
            if sent_wordids in known_ideas:
                new_partitions[p] = known_ideas[sent_wordids]
            else:
                top_ideas = metrics.get_top_candidates(sent_wordids, new_ptr_params, num_top_candidates=20, jaccard_cutoff=0.5)
                new_partitions[p] = metrics.get_assignment(
                    sent_wordids, top_ideas, ptr_data, new_ptr_params, pfst_params, alignment_library)

    logging.info('Writing new_ptr_params')
    # The open mode is an argument of open(), not a component of the path.
    with open(os.path.join(data_directory, 'ptr_params.greedy.001.p'), 'w') as f:
        cPickle.dump(new_ptr_params, f)