def generic_enumerators_top_k(dataset,attributes,configuration,user1_scope,user2_scope,votes_attributes,users_attributes,position_attribute,closed=True): #disagreement
    spent=0
    visited_patterns_more_detailed=set()
    nb_quality_measured=0
    interesting_patterns=[]
    len_attributes=len(attributes)
    range_len_attributes=range(len_attributes)
    top_k=configuration.get('top_k',float('inf'))
    if top_k is None : 
        top_k=float('inf')
    quality_threshold=float(configuration.get('quality_threshold',0))
    if quality_threshold is None:
        quality_threshold=0
    
    pruning = configuration.get('pruning',False)
    iwant=configuration['iwant']
    nb_dossiers_min=configuration['nb_dossiers_min']
    cover_threshold=configuration['cover_threshold']
    threshold_pair_comparaison=float(configuration['threshold_pair_comparaison'])
    comparaison_measure=configuration['comparaison_measure']
    
    subpattern_votes_filter=[attr['name'] in votes_attributes for attr in attributes]
    #subpattern_users_filter=[attr['name'] in users_attributes for attr in attributes]
    
    vote_id_attributes=votes_attributes[0]
    users_id_attributes=users_attributes[0]
     
    patterns_visited_valid={}
    ############################### GET DISTINCT VALUES ####################################
    #datasetFiltered=filter_pipeline_obj(dataset, user2_scope)[0]
    user2_scope_values={x[users_id_attributes] for x in filter_pipeline_obj(dataset, user2_scope)[0]}
    #print user2_scope_values
    votes_map_details,votes_map_meps,users_map_details,users_map_votes,arr_distinct_values=get_distinct_values(dataset,attributes,votes_attributes,users_attributes,position_attribute,user2_scope_values)
    votes_map_details_array=votes_map_details.values()    
    users_map_details_array=users_map_details.values()
    ##############################################################################################
    
    users_map_details_array_filtered_user1=filter_pipeline_obj(users_map_details_array, user1_scope)[0]
    users_map_details_array_filtered_user2=filter_pipeline_obj(users_map_details_array, user2_scope)[0]
    users_ids=set([obj[users_id_attributes] for obj in users_map_details_array])
    users1_ids=set([p_attr[users_id_attributes] for p_attr in users_map_details_array_filtered_user1])
    users2_ids=set([p_attr[users_id_attributes] for p_attr in users_map_details_array_filtered_user2])
    nb_users_voted=len(users1_ids)+len(users2_ids)
    
    ############################STATS for * ######################################
    
    original_mepsStatistics,original_mepsStatsNumbers,original_mepsMeta = extractStatistics_fromdataset_vectors(dataset,votes_attributes,users_attributes,position_attribute,user1_scope,user2_scope)
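    # pairwise similarities between users, computed once on the full dataset;
    # they constitute the reference model each subgroup model is compared against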
    
    original_mepwise_similarities={}
    for user1 in original_mepsStatsNumbers:
        original_mepwise_similarities[user1]={}
        for user2 in original_mepsStatsNumbers[user1]:
            original_mepwise_similarities[user1][user2]=get_sim_vectors(original_mepsStatsNumbers,user1,user2,comparaison_measure)
                
    ##############################################################################
    
    arr_data,arr_types,arr_depthmax,arr_refinement_indexes,arr_labels,subgroup_pipeline,filter_operations,subgroup_pipeline_for_votes,subgroup_pipeline_for_meps = get_arrdata_from_dataset_values(arr_distinct_values, attributes,votes_attributes,users_attributes,position_attribute)
    configuration['stats']=original_mepsStatistics
    configuration['skip_list']=[]
    configuration['closing']=[None]*len(attributes)
    #configuration['closing_refin']=[None]*len(attributes)
    
    configuration['closing_refin']=[([''],None)]*len(attributes)
    
    enumerator=generic_enumerator_multiattributes_dfs(arr_data,arr_types,arr_refinement_indexes,arr_depthmax,configuration,bitwise=None,stats=original_mepsStatistics)
    index,index_has_been_visited,index_frequent,index_valid,index_non_valid,index_good=0,0,0,0,0,0
    
    ##################################FAST PRE-PROCESSING OF THEME ATTRIBUTES#################################
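    # theme tags are hierarchical (e.g. '1.20.5'); expanding each tag into all of
    # its prefixes {'1', '1.20', '1.20.5'} turns taxonomy subsumption tests into
    # plain set operations; the raw tag ids are kept under '<name>0' so that
    # minimal (closed) descriptions can still be computed later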
    for attr in attributes:
        if attr['type'] in ['themes','themes2']:
            dimensionName=attr['name']
            for obj in votes_map_details_array:
                s2_arr=[]
                obj_idlabels=set()
                s2_arr_extend=s2_arr.extend
                for val in obj[dimensionName]: 
                    v_theme_id=val[:val.index(' ')]
                    #print v_theme_id
                    obj_idlabels|={v_theme_id}
                    v=v_theme_id.split('.')
                    s2_arr_extend(['.'.join(v[0:x+1]) for x in range(len(v))])
                s2=set(s2_arr)
                obj[dimensionName+'0']=obj_idlabels#obj[dimensionName][:]
                if obj[dimensionName]==[]:
                    obj[dimensionName+'0']=[' ']
                obj[dimensionName]=s2
    ##################################FAST PRE-PROCESSING OF THEME ATTRIBUTES#################################
    
    for p,bitwise_p,stats in enumerator: 
        #print p
        zip_pvote_filter=zip(p,subpattern_votes_filter)
        p_vote=[];p_vote_append=p_vote.append
        p_mep=[];p_mep_append=p_mep.append

        
        
        tuple_p=tuple();attr_ind=0
        for p_attr,y in zip_pvote_filter:
            if y:
                p_vote_append(p_attr)
            else:
                p_mep_append(p_attr)
            tuple_p+=(tuple(p_attr),)
            subgroup_pipeline[attr_ind][filter_operations[attr_ind]]=p_attr
            attr_ind+=1
        
        index+=1
    
        #################################COMPUTE SUBGROUP AND CHECK WHETHER IT IS A FREQUENT PATTERN##########################
        
        filteredDataset_votes,bitwise = filter_pipeline_obj(votes_map_details_array, subgroup_pipeline_for_votes, bitwise_p) 
        nb_votes=len(filteredDataset_votes)
        #print p,set().union(*[obj[attributes[0]['name']] for obj in filteredDataset_votes])
#         raw_input('...')
        
        
        configuration['bitwise']=bitwise
        users_map_details_array_filtered_user2_pattern,unused_bitwise=filter_pipeline_obj(users_map_details_array_filtered_user2, subgroup_pipeline_for_meps) 
        users2_ids_pattern=set([p_attr[users_id_attributes] for p_attr in users_map_details_array_filtered_user2_pattern])    
        
        v_ids=set()
        dossiers_ids=set()
        # prune infrequent patterns: at exact equality with the threshold the
        # subtree is marked for skipping but the pattern itself is still evaluated
        if nb_votes<=threshold_pair_comparaison :
            configuration['skip_list']=[p]
            if nb_votes<threshold_pair_comparaison :
                continue
        
        
        for obj in filteredDataset_votes:
            v_ids |= {obj[vote_id_attributes]}
            dossiers_ids |= {obj['DOSSIERID']}
        
        nb_dossiers=len(dossiers_ids)
        
        if nb_dossiers<=nb_dossiers_min :
            configuration['skip_list']=[p]
            if nb_dossiers<nb_dossiers_min:
                continue
             
        p_more_detailed=p
        #tuple_p_more_detailed=tuple_p
        ################CLOSED###################
        start=time()
        if closed:
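            # compute the closure of p: the most specific pattern having exactly the
            # same extent (the filtered votes); enumerating only closed patterns avoids
            # re-evaluating syntactically different but extensionally equal candidates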
            p_more_detailed=[];p_more_detailed_append=p_more_detailed.append
            arr_data_detailed=[];arr_data_detailed_append= arr_data_detailed.append
            arr_refin_detailed=[];arr_refin_detailed_append=arr_refin_detailed.append
            tuple_p_more_detailed=tuple()
            for i in range_len_attributes:
                attr=attributes[i]
                attr_type=attr['type']
                attr_name=attr['name'] if attr_type not in ['themes','themes2'] else attr['name']+'0'
                depth_max=attr['bound_width']
                subp=p[i]
                values=[x[attr_name] for x in filteredDataset_votes]
                min_p,min_arr,min_rein=POSSIBLE_ENUMERATOR_MINIMAL_DESCRIPTION[attr_type](values)
                min_p = subp if min_p is None else min_p
                
                p_more_detailed_append(min_p)
#                 if depth_max>len(min_p):
#                     arr_data_detailed_append(min_arr)
#                     arr_refin_detailed_append(subp)
#                 else :

                toContinueFrom=closure_continue_from(subp,min_p)
                #print subp,'-',min_p,'-',toContinueFrom
                arr_data_detailed_append(min_arr)
                arr_refin_detailed_append((toContinueFrom,min_rein))
                tuple_p_more_detailed+=(tuple(p_more_detailed[-1]),)
            
            configuration['closing']=arr_data_detailed
            configuration['closing_refin']=arr_refin_detailed       
            
            
            
            if not all (POSSIBLE_ENUMERATOR_RESPECT_ORDER[attributes[i]['type']](p[i],p_more_detailed[i]) for i in range_len_attributes):
                #print p,p_more_detailed
                spent+=time()-start
                configuration['skip_list']=[p]
                continue
            
#             if closed and all(p[i]==p_more_detailed[i] for i in range_len_attributes  if attributes[i]['type']<>'themes' ) and not all(respect_order(p[i],p_more_detailed[i]) for i in range_len_attributes  if attributes[i]['type']=='themes' ):
#                 if tuple_p_more_detailed in visited_patterns_more_detailed:
#                     spent+=time()-start
#                     configuration['skip_list']=[p]
#                     continue
#                 visited_patterns_more_detailed|={tuple_p_more_detailed}
                
             
        spent+=time()-start
        ################################################
        

        
        filteredDataset_meps_votes={}
        users_ids_set=set()
        max_votes_pairwise=0
        for key in users_ids:
            value=users_map_votes[key]
            votes_user=(value & v_ids)
            len_votes_user=len(votes_user)
            if len_votes_user>=threshold_pair_comparaison:
                filteredDataset_meps_votes[key]=votes_user
                users_ids_set|={key}
                max_votes_pairwise=len_votes_user if len_votes_user>max_votes_pairwise else max_votes_pairwise
        
        
        if max_votes_pairwise<=threshold_pair_comparaison :
            configuration['skip_list']=[p]
            if max_votes_pairwise<threshold_pair_comparaison :
                continue
        
            
        users1_ids_set=users1_ids & users_ids_set
        users2_ids_set=users2_ids & users_ids_set & users2_ids_pattern
        # NEW: the denominator now counts only user-2 members matching the pattern
        nb_users_voted=len(users1_ids)+len(users2_ids_pattern)
        
        #max_votes_pairwise=max(len(value) for key,value in filteredDataset_meps_votes.iteritems())
        
        index_frequent+=1
        
        ##########################################################################################
        
        
        ######################################COVER COMPUTING######################################
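        # skip a pattern whose closure was already visited, then discard any pattern
        # whose extent overlaps an already-retained pattern by cover_threshold or more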
        
        if closed:
            if tuple_p_more_detailed in visited_patterns_more_detailed:
                spent+=time()-start
                #configuration['skip_list']=[p]
                index_has_been_visited+=1
                continue
            visited_patterns_more_detailed|={tuple_p_more_detailed}
                
        if not closed or cover_threshold<1.:        
            max_cover,b_max_cover,pattern_max,bitarray_bitwise,bitarray_bitwise_count=cover_max_computation_new(p,p_vote,p_mep, bitwise, patterns_visited_valid, cover_threshold)
            
            if (max_cover>=cover_threshold):
                index_has_been_visited+=1
                if (pruning and b_max_cover<quality_threshold):
                    configuration['skip_list']=[p]
                continue
        
#         if tuple_p_more_detailed in visited_patterns_more_detailed:
#             continue
#         visited_patterns_more_detailed|={tuple_p_more_detailed}
        
        
        
#         max_cover_dossier=0
#         for ancient_dossiers_visited in dossiers_ids_visited:
#             
#             max_cover_dossier= max(max_cover_dossier,len(ancient_dossiers_visited & dossiers_ids) / float(len(ancient_dossiers_visited)))
#         if (max_cover_dossier>0.3):
#             continue
        
        #########################################################################################################

        
        returned_mepsStatistics,returned_mepsStatsNumbers,returned_mepsMeta = extractStatistics_fromdataset_new_update_not_square_vectors(stats, original_mepsMeta, votes_attributes, users_attributes, position_attribute, v_ids, users_ids_set,users1_ids_set,users2_ids_set)
        configuration['stats']=returned_mepsStatistics
        issquare=(users1_ids_set==users2_ids_set)
        #print issquare
        ######################################UPPER BOUND EXPECTED COMPUTING######################################
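        # build the subgroup model, score it against the reference model and obtain
        # an optimistic upper bound on the quality, used below for pruning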
        
        original_model,pattern_model=compute_models(original_mepwise_similarities,returned_mepsStatsNumbers,comparaison_measure,issquare)
        quality,borne_max_quality=compute_quality(original_model,pattern_model,threshold_pair_comparaison,nb_users_voted,iwant)
  
        nb_quality_measured+=1
        ######################################UPPER BOUND EXPECTED COMPUTING######################################
        
        index_non_valid+=1
 
        if (pruning and borne_max_quality<quality_threshold):
            configuration['skip_list']=[p]
            continue

        index_valid+=1
        
        label = [POSSIBLE_ENUMERATOR_DESCRIPTION_YIELDER[arr_types[attr_ind]](p_more_detailed[attr_ind],arr_labels[attr_ind]) for attr_ind in range(len(p_more_detailed))]
        #print p,'\t',nb_dossiers,'\t',len(v_ids),'\t',max_cover,'\t',quality,'\t',borne_max_quality,'\t',quality>=quality_threshold,'\t',index,'\t',index_frequent,'\t',index_non_valid,'\t',index_valid,'\t',index_good
        print p_more_detailed,'\t',quality,'\t',borne_max_quality,'\t',nb_votes # parent_vote,'\t',parent_mep,'\t',
        #raw_input('...')

        if (quality>=quality_threshold):
            if not closed or cover_threshold<1.:
                patterns_visited_valid[tuple_p]={'pattern':p,'dossiers':dossiers_ids,'bitwise':bitarray_bitwise,'bitwise_count':bitarray_bitwise_count,'upper_bound':borne_max_quality,'pattern_vote':p_vote,'pattern_user':p_mep}
            
            dataset_stats=datasetStatistics(returned_mepsStatsNumbers, returned_mepsMeta,votes_attributes,users_attributes,position_attribute)
            
            index_good+=1
            # per-procedure vote counts and percentages, sorted by decreasing share
            title_counts={}
            for v in filteredDataset_votes:
                title_counts[v['PROCEDURE_TITLE']]=title_counts.get(v['PROCEDURE_TITLE'],0)+1
            dossiers_voted=sorted([(title,float('%.2f' % ((count/float(len(v_ids)))*100)),count) for title,count in title_counts.iteritems()],key=lambda entry : entry[1],reverse=True)
            interesting_patterns.append([p_more_detailed,label,dataset_stats,quality,borne_max_quality,dossiers_voted])
            if len(interesting_patterns)>top_k:
                interesting_patterns=sorted(interesting_patterns,key=lambda p_attr : p_attr [3],reverse=True)[:top_k]
                quality_threshold=interesting_patterns[-1][3]
    
    print index,'\t',index_has_been_visited,'\t',index_frequent,'\t',index_non_valid,'\t',index_valid,'\t',index_good
    print 'TIMESPENT = ',spent
    for p,label,dataset_stats,quality,borne_max_quality,dossiers_voted in sorted(interesting_patterns,key=lambda p_attr : p_attr [3],reverse=True):
        yield p,label,dataset_stats,quality,borne_max_quality,dossiers_voted
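
# A minimal usage sketch for generic_enumerators_top_k (not part of the original
# module). All concrete values below -- attribute names, scopes and thresholds --
# are illustrative assumptions; only the configuration keys are the ones actually
# read by the function.
def example_generic_enumerators_top_k(dataset, attributes):
    configuration = {
        'top_k': 10,                       # keep only the 10 best patterns
        'quality_threshold': 0.3,          # minimal quality for a pattern to be kept
        'pruning': True,                   # enable upper-bound based pruning
        'iwant': 'DISAGR_SUMDIFF',         # quality measure identifier (assumed)
        'nb_dossiers_min': 5,              # minimal number of distinct dossiers
        'cover_threshold': 0.8,            # maximal overlap with retained patterns
        'threshold_pair_comparaison': 30,  # minimal shared votes per user pair
        'comparaison_measure': 'MAAD',     # pairwise similarity measure
    }
    results = generic_enumerators_top_k(
        dataset, attributes, configuration,
        user1_scope=[], user2_scope=[],    # empty scopes: consider all users
        votes_attributes=['VOTEID'], users_attributes=['EP_ID'],
        position_attribute='VOTE', closed=True)
    for pattern, label, stats, quality, upper_bound, dossiers_voted in results:
        print label, quality, upper_bound
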
def process_outcome_dataset(itemsfile,
                            usersfile,
                            outcomesfile,
                            numeric_attrs=[],
                            array_attrs=[],
                            outcome_attrs=None,
                            method_aggregation_outcome='VECTOR_VALUES',
                            itemsScope=[],
                            contexts_scope=[],
                            users_1_Scope=[],
                            users_2_Scope=[],
                            nb_items=float('inf'),
                            nb_individuals=float('inf'),
                            attributes_to_consider=None,
                            nb_items_entities=float('inf'),
                            nb_items_individuals=float('inf'),
                            hmt_to_itemset=False,
                            delimiter='\t'):

    FULL_OUTCOME_CONSIDERED = False
    ITEMS_METADATA_NEEDED = False
    USE_CACHE = False
    SIZE_ESTIMATION = False
    SMALLER_DESCRIPTION_SPACE = True
    VERBOSE = False
    CLARIFY = True
    to_consider_ids_in_contexts_scope = None
    nb_outcome_considered = 0

    if 'CACHE' in dir(process_outcome_dataset):
        # NOTE: the cached path does not restore vector_of_action nor the
        # contexts-scope ids computed below
        items, items_header, users, users_header, outcomes, outcomes_header, \
            items_id, users_id, outcome_attrs, position_attr, outcomes_processed, \
            considered_items, users1, users2, considered_items_ids, \
            considered_users_1_ids, considered_users_2_ids, \
            considered_users_ids = process_outcome_dataset.CACHE
    else:
        items, items_header = readCSVwithHeader(itemsfile,
                                                numberHeader=numeric_attrs,
                                                arrayHeader=array_attrs,
                                                selectedHeader=None,
                                                delimiter=delimiter)

        users, users_header = readCSVwithHeader(usersfile,
                                                numberHeader=numeric_attrs,
                                                arrayHeader=array_attrs,
                                                selectedHeader=None,
                                                delimiter=delimiter)
        outcomes, outcomes_header = readCSVwithHeader(
            outcomesfile,
            numberHeader=numeric_attrs,
            arrayHeader=array_attrs,
            selectedHeader=None,
            delimiter=delimiter)
        items_id = items_header[0]
        users_id = users_header[0]

        outcome_attrs = outcome_attrs if outcome_attrs is not None else [
            outcomes_header[2]
        ]
        position_attr = outcome_attrs[0]
        outcomes_processed, vector_of_action = outcome_representation_in_reviews(
            outcomes, position_attr, outcome_attrs, method_aggregation_outcome)

        considered_items = filter_pipeline_obj(items, itemsScope)[0]
        considered_items_in_contexts_scope = filter_pipeline_obj(
            items, contexts_scope)[0]

        users1 = filter_pipeline_obj(users, users_1_Scope)[0]
        users2 = filter_pipeline_obj(users, users_2_Scope)[0]

        get_items_ids = partial(map, itemgetter(items_id))
        get_users_ids = partial(map, itemgetter(users_id))

        considered_items_ids = set(get_items_ids(considered_items))
        to_consider_ids_in_contexts_scope = set(
            get_items_ids(considered_items_in_contexts_scope))
        considered_users_1_ids = set(get_users_ids(users1))
        considered_users_2_ids = set(get_users_ids(users2))
        considered_users_ids = set(considered_users_1_ids) | set(
            considered_users_2_ids)
        if USE_CACHE:
            process_outcome_dataset.CACHE = [
                items, items_header, users, users_header, outcomes,
                outcomes_header, items_id, users_id, outcome_attrs,
                position_attr, outcomes_processed, considered_items[:],
                users1[:], users2[:],
                set(considered_items_ids),
                set(considered_users_1_ids),
                set(considered_users_2_ids),
                set(considered_users_ids)
            ]

    if nb_items < float('inf'):
        nb_items = int(nb_items)
        considered_items_ids = set(sorted(considered_items_ids)[:nb_items])
        considered_items = [
            x for x in considered_items if x[items_id] in considered_items_ids
        ]
    if nb_individuals < float('inf'):
        nb_individuals = int(nb_individuals)
        considered_users_ids = set(
            sorted(considered_users_ids)[:nb_individuals])
        users1 = [x for x in users1 if x[users_id] in considered_users_ids]
        users2 = [x for x in users2 if x[users_id] in considered_users_ids]

    all_users_to_items_outcomes = {}
    outcomes_considered = []
    outcomes_considered_append = outcomes_considered.append
    items_metadata = {
        row[items_id]: row
        for row in considered_items if row[items_id] in considered_items_ids
    } if ITEMS_METADATA_NEEDED else {}
    users_metadata = {
        row[users_id]: row
        for row in users if row[users_id] in considered_users_ids
    }
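    # build the nested user -> item -> outcome mapping, restricted to the selected
    # items and users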

    for row in outcomes_processed:
        v_id_rev = row[items_id]
        u_id_rev = row[users_id]
        if v_id_rev in considered_items_ids and u_id_rev in considered_users_ids:
            pos_rev = row[position_attr]
            if u_id_rev not in all_users_to_items_outcomes:
                all_users_to_items_outcomes[u_id_rev] = {}
            all_users_to_items_outcomes[u_id_rev][v_id_rev] = pos_rev
            if FULL_OUTCOME_CONSIDERED:
                outcomes_considered_append({
                    items_id: v_id_rev,
                    users_id: u_id_rev,
                    position_attr: pos_rev
                })

            nb_outcome_considered += 1

    considered_users_1_sorted = sorted(users1, key=itemgetter(users_id))
    considered_users_2_sorted = sorted(users2, key=itemgetter(users_id))
    considered_items_sorted = sorted(considered_items,
                                     key=itemgetter(items_id))
    if SIZE_ESTIMATION:
        from pympler.asizeof import asizeof
        print asizeof(all_users_to_items_outcomes)
        print asizeof(considered_items_sorted)
        print asizeof(considered_users_1_sorted)
        print asizeof(considered_users_2_sorted)
    gc.collect()

    if SMALLER_DESCRIPTION_SPACE:
        NB_SELECTED_ITEMSET_ENTITIES = nb_items_entities  #100
        NB_SELECTED_ITEMSET_INDIVIDUALS = nb_items_individuals  #100
        from enumerator.enumerator_attribute_complex import init_attributes_complex, create_index_complex
        from enumerator.enumerator_attribute_themes2 import tree_leafs, tree_remove

        #####################################ENTITIES - DESCRIPTION SPACE LIMITING##########################################
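        # shrink the entities description space down to a budget of
        # NB_SELECTED_ITEMSET_ENTITIES itemsets: theme subtrees are pruned,
        # numeric domains are subsampled and rare categorical values are grouped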
        concerned_attributes_entities_numeric = sorted(
            set(attributes_to_consider) & set(numeric_attrs)
            & set(items_header))
        concerned_attributes_entities_hmt = sorted(
            set(attributes_to_consider) & set(array_attrs) & set(items_header))
        concerned_attributes_entities_categorical = sorted(
            set(attributes_to_consider) & set(items_header) -
            (set(concerned_attributes_entities_hmt)
             | set(concerned_attributes_entities_numeric)))
        attributes_plain = [{
            'name': a,
            'type': 'themes'
        } for a in concerned_attributes_entities_hmt] + [{
            'name': a,
            'type': 'numeric'
        } for a in concerned_attributes_entities_numeric] + [{
            'name': a,
            'type': 'simple'
        } for a in concerned_attributes_entities_categorical]
        attributes = [{
            'name': a,
            'type': 'themes'
        } for a in concerned_attributes_entities_hmt] + [{
            'name': a,
            'type': 'numeric'
        } for a in concerned_attributes_entities_numeric] + [{
            'name': a,
            'type': 'simple'
        } for a in concerned_attributes_entities_categorical]
        attributes = init_attributes_complex(considered_items_sorted,
                                             attributes)  #X
        index_all = create_index_complex(considered_items_sorted,
                                         attributes)  #Y
        nb_itemsets_all = 0
        for attr in attributes:
            if attr['type'] in {'numeric'}:
                nb_itemsets_all += 2 * len(attr['domain'])
                attr['nb_items'] = 2 * len(attr['domain'])
            else:
                nb_itemsets_all += len(attr['domain'])
                attr['nb_items'] = len(attr['domain'])

        if NB_SELECTED_ITEMSET_ENTITIES <= 1:
            NB_SELECTED_ITEMSET_ENTITIES = int(nb_itemsets_all *
                                               NB_SELECTED_ITEMSET_ENTITIES)
            print "NB_SELECTED_ITEMSET_ENTITIES AFTER RATIO = ", NB_SELECTED_ITEMSET_ENTITIES

        nb_itemsets_all_context = NB_SELECTED_ITEMSET_ENTITIES

        nb_itemsets_to_remove = max(
            0, nb_itemsets_all - NB_SELECTED_ITEMSET_ENTITIES)
        if nb_itemsets_to_remove > 0:
            if VERBOSE:
                print 'Entities Search Space Modification : ', NB_SELECTED_ITEMSET_ENTITIES
            factor = NB_SELECTED_ITEMSET_ENTITIES / float(nb_itemsets_all)
            #print factor
            for attr in attributes:
                attr['nb_items_new'] = int(round(factor * attr['nb_items']))
                attr['nb_items_to_remove'] = attr['nb_items'] - attr[
                    'nb_items_new']
                attr_name = attr['name']
                attr_type = attr['type']
                attr_domain = attr['domain']
                attr_labelmap = attr['labelmap']
                if attr_type == 'themes':
                    tags_removed = set(
                        tree_remove(attr_domain, attr['nb_items_to_remove']))
                    tags_keeped = set(attr_domain) - tags_removed
                    for i, o in enumerate(index_all):
                        o[attr_name] = o[attr_name] & tags_keeped
                        considered_items_sorted[i][attr_name] = [
                            attr_labelmap[t] if t != '' else t
                            for t in sorted(o[attr_name])
                        ]
                elif attr_type == 'numeric':
                    values_to_remove = int(attr['nb_items_to_remove'] / 2)
                    values_to_keep = len(attr['domain']) - values_to_remove
                    if values_to_keep == 0:
                        values_to_keep = 1
                    new_attr_domain = [
                        attr_domain[int(
                            round((k / float(values_to_keep)) *
                                  len(attr_domain)))]
                        for k in range(values_to_keep)
                    ]
                    mapto = {
                        x:
                        new_attr_domain[bisect_right(new_attr_domain, x) - 1]
                        for x in attr_domain
                    }

                    for i, o in enumerate(index_all):
                        o[attr_name] = mapto[o[attr_name]]
                        considered_items_sorted[i][attr_name] = o[attr_name]
                elif attr_type == 'simple':
                    if attr['nb_items_to_remove'] >= len(attr['domain']):
                        attr['nb_items_to_remove'] = -1
                    groupOfValues = 'From ' + attr['domain'][
                        0] + ' To ' + attr['domain'][
                            attr['nb_items_to_remove']]
                    new_attr_domain = [
                        groupOfValues
                    ] + attr['domain'][attr['nb_items_to_remove'] + 1:]
                    mapto = {
                        x: x if x in new_attr_domain else groupOfValues
                        for x in attr_domain
                    }
                    for i, o in enumerate(index_all):
                        o[attr_name] = mapto[o[attr_name]]
                        considered_items_sorted[i][attr_name] = o[attr_name]
        else:
            if VERBOSE:
                print 'no Entities Search Space Modification'
    #####################################ENTITIES - DESCRIPTION SPACE LIMITING##########################################

    #####################################INDIVIDUALS - DESCRIPTION SPACE LIMITING##########################################
        concerned_attributes_individuals_numeric = sorted(
            set(attributes_to_consider) & set(numeric_attrs)
            & set(users_header))
        concerned_attributes_individuals_hmt = sorted(
            set(attributes_to_consider) & set(array_attrs) & set(users_header))
        concerned_attributes_individuals_categorical = sorted(
            set(attributes_to_consider) & set(users_header) -
            (set(concerned_attributes_individuals_hmt)
             | set(concerned_attributes_individuals_numeric)))
        attributes_plain = [{
            'name': a,
            'type': 'themes'
        } for a in concerned_attributes_individuals_hmt] + [{
            'name': a,
            'type': 'numeric'
        } for a in concerned_attributes_individuals_numeric] + [{
            'name':
            a,
            'type':
            'simple'
        } for a in concerned_attributes_individuals_categorical]
        attributes = [{
            'name': a,
            'type': 'themes'
        } for a in concerned_attributes_individuals_hmt] + [{
            'name': a,
            'type': 'numeric'
        } for a in concerned_attributes_individuals_numeric] + [{
            'name':
            a,
            'type':
            'simple'
        } for a in concerned_attributes_individuals_categorical]

        considered_users_sorted = users_metadata.values()  # dict values: no guaranteed order

        attributes = init_attributes_complex(considered_users_sorted,
                                             attributes)  #X
        index_all = create_index_complex(considered_users_sorted,
                                         attributes)  #Y
        nb_itemsets_all = 0
        for attr in attributes:
            if attr['type'] in {'numeric'}:
                nb_itemsets_all += 2 * len(attr['domain'])
                attr['nb_items'] = 2 * len(attr['domain'])
            else:
                nb_itemsets_all += len(attr['domain'])
                attr['nb_items'] = len(attr['domain'])

        if NB_SELECTED_ITEMSET_INDIVIDUALS <= 1:
            #print "nb_itemsets_all",nb_itemsets_all,NB_SELECTED_ITEMSET_INDIVIDUALS
            NB_SELECTED_ITEMSET_INDIVIDUALS = int(
                nb_itemsets_all * NB_SELECTED_ITEMSET_INDIVIDUALS)
            print "NB_SELECTED_ITEMSET_INDIVIDUALS AFTER RATIO = ", NB_SELECTED_ITEMSET_INDIVIDUALS

        nb_itemsets_all_individuals = nb_itemsets_all
        nb_itemsets_to_remove = max(
            0, nb_itemsets_all - NB_SELECTED_ITEMSET_INDIVIDUALS)
        if nb_itemsets_to_remove > 0:
            if VERBOSE:
                print 'Individuals Search Space Modification : ', NB_SELECTED_ITEMSET_INDIVIDUALS
            factor = NB_SELECTED_ITEMSET_INDIVIDUALS / float(nb_itemsets_all)
            #print factor
            for attr in attributes:
                attr['nb_items_new'] = int(round(factor * attr['nb_items']))
                attr['nb_items_to_remove'] = attr['nb_items'] - attr[
                    'nb_items_new']
                attr_name = attr['name']
                attr_type = attr['type']
                attr_domain = attr['domain']
                attr_labelmap = attr['labelmap']
                if attr_type == 'themes':
                    tags_removed = set(
                        tree_remove(attr_domain, attr['nb_items_to_remove']))
                    tags_keeped = set(attr_domain) - tags_removed
                    for i, o in enumerate(index_all):
                        o[attr_name] = o[attr_name] & tags_keeped
                        considered_users_sorted[i][attr_name] = [
                            attr_labelmap[t] if t != '' else t
                            for t in sorted(o[attr_name])
                        ]
                elif attr_type == 'numeric':
                    values_to_remove = int(attr['nb_items_to_remove'] / 2)
                    values_to_keep = len(attr['domain']) - values_to_remove
                    if values_to_keep == 0:
                        values_to_keep = 1
                    new_attr_domain = [
                        attr_domain[int(
                            round((k / float(values_to_keep)) *
                                  len(attr_domain)))]
                        for k in range(values_to_keep)
                    ]
                    mapto = {
                        x:
                        new_attr_domain[bisect_right(new_attr_domain, x) - 1]
                        for x in attr_domain
                    }

                    for i, o in enumerate(index_all):
                        o[attr_name] = mapto[o[attr_name]]
                        considered_users_sorted[i][attr_name] = o[attr_name]
                elif attr_type == 'simple':
                    # print len(attr['domain'])
                    # print attr['nb_items_to_remove']
                    # print attr['nb_items_new']
                    if attr['nb_items_to_remove'] >= len(attr['domain']):
                        attr['nb_items_to_remove'] = -1
                    groupOfValues = 'From ' + attr['domain'][
                        0] + ' To ' + attr['domain'][
                            attr['nb_items_to_remove']]
                    new_attr_domain = [
                        groupOfValues
                    ] + attr['domain'][attr['nb_items_to_remove'] + 1:]
                    mapto = {
                        x: x if x in new_attr_domain else groupOfValues
                        for x in attr_domain
                    }
                    for i, o in enumerate(index_all):
                        o[attr_name] = mapto[o[attr_name]]
                        considered_users_sorted[i][attr_name] = o[attr_name]
        else:
            if VERBOSE:
                print 'no Individuals Search Space Modification'

        users1 = [
            x for x in considered_users_sorted
            if x[users_id] in considered_users_1_ids
        ]
        users2 = [
            x for x in considered_users_sorted
            if x[users_id] in considered_users_2_ids
        ]
        considered_users_1_sorted = sorted(users1, key=itemgetter(users_id))
        considered_users_2_sorted = sorted(users2, key=itemgetter(users_id))
        #####################################INDIVIDUALS - DESCRIPTION SPACE LIMITING##########################################

        items_metadata = {
            row[items_id]: row
            for row in considered_items_sorted
            if row[items_id] in considered_items_ids
        } if ITEMS_METADATA_NEEDED else {}
        users_metadata = {
            row[users_id]: row
            for row in considered_users_sorted
            if row[users_id] in considered_users_ids
        }

    # if CLARIFY:
    # 	concerned_attributes_entities_numeric=sorted(set(attributes_to_consider)&set(numeric_attrs)&set(items_header))
    # 	concerned_attributes_entities_hmt=sorted(set(attributes_to_consider)&set(array_attrs)&set(items_header))
    # 	concerned_attributes_entities_categorical=sorted(set(attributes_to_consider)&set(items_header)-(set(concerned_attributes_entities_hmt)|set(concerned_attributes_entities_numeric)))
    # 	attributes=[{'name':a,'type':'themes'} for a in concerned_attributes_entities_hmt]+[{'name':a,'type':'numeric'} for a in concerned_attributes_entities_numeric]+[{'name':a,'type':'simple'} for a in concerned_attributes_entities_categorical]
    # 	clarify_dataset(considered_items_sorted,attributes,items_id)
    #print considered_items_sorted[0]

    TOITEMSET = hmt_to_itemset
    if TOITEMSET:
        print '   '
        print 'Transform HMT to Itemset ...'
        print '   '
    if TOITEMSET:
        concerned_attributes_entities_numeric = sorted(
            set(attributes_to_consider) & set(numeric_attrs)
            & set(items_header))
        concerned_attributes_entities_hmt = sorted(
            set(attributes_to_consider) & set(array_attrs) & set(items_header))
        concerned_attributes_entities_categorical = sorted(
            set(attributes_to_consider) & set(items_header) -
            (set(concerned_attributes_entities_hmt)
             | set(concerned_attributes_entities_numeric)))
        attributes_plain = [{
            'name': a,
            'type': 'themes'
        } for a in concerned_attributes_entities_hmt] + [{
            'name': a,
            'type': 'numeric'
        } for a in concerned_attributes_entities_numeric] + [{
            'name': a,
            'type': 'simple'
        } for a in concerned_attributes_entities_categorical]
        attributes = [{
            'name': a,
            'type': 'themes'
        } for a in concerned_attributes_entities_hmt] + [{
            'name': a,
            'type': 'numeric'
        } for a in concerned_attributes_entities_numeric] + [{
            'name': a,
            'type': 'simple'
        } for a in concerned_attributes_entities_categorical]
        attributes = init_attributes_complex(considered_items_sorted,
                                             attributes)  #X
        index_all = create_index_complex(considered_items_sorted,
                                         attributes)  #Y
        for attr in attributes:
            #attr['nb_items_new']=int(round(factor*attr['nb_items']))
            #attr['nb_items_to_remove']=attr['nb_items']-attr['nb_items_new']
            attr_name = attr['name']
            attr_type = attr['type']
            attr_domain = attr['domain']
            attr_labelmap = attr['labelmap']
            index_tag_items = {
                t: str(i).zfill(5)
                for i, t in enumerate(attr_domain.keys())
            }
            index_tag_items = {
                k: v + " " + attr_labelmap[k].partition(" ")[-1]
                for k, v in index_tag_items.iteritems()
            }
            if attr_type == 'themes':
                for i, o in enumerate(index_all):
                    # print '   '
                    # print considered_items_sorted[i][attr_name]
                    considered_items_sorted[i][attr_name] = [
                        index_tag_items[t] for t in sorted(o[attr_name])
                        if t != ''
                    ]
                    # print considered_items_sorted[i][attr_name]
                    # print '   '
                    # raw_input('....')

    REPLACING_IDS = False
    if REPLACING_IDS:
        ind_to_dict_items = {
            i: x[items_id]
            for i, x in enumerate(considered_items_sorted)
        }

        dict_to_ind_items = {v: k for k, v in ind_to_dict_items.iteritems()}
        considered_items_sorted = [
            dict([(items_id, dict_to_ind_items[x[items_id]])] +
                 [(k, v) for k, v in x.items() if k != items_id])
            for x in considered_items_sorted
        ]
        to_consider_ids_in_contexts_scope = {
            dict_to_ind_items[x]
            for x in to_consider_ids_in_contexts_scope
        }

        ind_to_dict_users = {i: x for i, x in enumerate(users_metadata)}
        dict_to_ind_users = {v: k for k, v in ind_to_dict_users.iteritems()}
        considered_users_1_sorted = [
            dict([(users_id, dict_to_ind_users[x[users_id]])] +
                 [(k, v) for k, v in x.items() if k != users_id])
            for x in considered_users_1_sorted
        ]
        considered_users_2_sorted = [
            dict([(users_id, dict_to_ind_users[x[users_id]])] +
                 [(k, v) for k, v in x.items() if k != users_id])
            for x in considered_users_2_sorted
        ]
        considered_users_sorted = [
            dict([(users_id, dict_to_ind_users[x[users_id]])] +
                 [(k, v) for k, v in x.items() if k != users_id])
            for x in considered_users_sorted
        ]
        all_users_to_items_outcomes = {
            dict_to_ind_users[u]:
            {dict_to_ind_items[v]: o
             for v, o in u_votes.iteritems()}
            for u, u_votes in all_users_to_items_outcomes.iteritems()
        }

        items_metadata = {
            row[items_id]: row
            for row in considered_items_sorted
        }
        users_metadata = {
            dict_to_ind_users[u]:
            dict([(users_id, dict_to_ind_users[x[users_id]])] +
                 [(k, v) for k, v in x.items() if k != users_id])
            for u, x in users_metadata.iteritems()
        }
    # print vector_of_action
    # raw_input('**************')
    else:
        items_metadata = {
            row[items_id]: row
            for row in considered_items_sorted
        }

    process_outcome_dataset.STATS = {
        'nb_items_entities': nb_itemsets_all_context,
        'nb_items_individuals': nb_itemsets_all_individuals
    }

    return items_metadata, users_metadata, all_users_to_items_outcomes, outcomes_considered, items_id, users_id, considered_items_sorted, considered_users_1_sorted, considered_users_2_sorted, nb_outcome_considered, vector_of_action, to_consider_ids_in_contexts_scope
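
# A minimal sketch of loading a behavioral dataset with process_outcome_dataset
# (not part of the original module). File names and attribute names are
# illustrative assumptions; the keyword arguments mirror the signature above.
def example_process_outcome_dataset():
    (items_metadata, users_metadata, all_users_to_items_outcomes,
     outcomes_considered, items_id, users_id, considered_items_sorted,
     considered_users_1_sorted, considered_users_2_sorted,
     nb_outcome_considered, vector_of_action,
     to_consider_ids_in_contexts_scope) = process_outcome_dataset(
        'items.csv', 'users.csv', 'outcomes.csv',
        numeric_attrs=['AGE'], array_attrs=['PROCEDURE_SUBJECT'],
        outcome_attrs=['VOTE'],
        attributes_to_consider=['PROCEDURE_SUBJECT', 'AGE'],
        nb_items_entities=100, nb_items_individuals=100,
        delimiter='\t')
    print nb_outcome_considered, len(items_metadata), len(users_metadata)
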
Example #3
def Alpha_Entry_Point(itemsFile,usersFile,reviewsFile,numeric_attrs=[],array_attrs=None,outcome_attrs=None,method_aggregation_outcome='VECTOR_VALUES',itemsScope=[],users_1_Scope=[],users_2_Scope=[],delimiter='\t',
	description_attributes_items=[],description_attributes_users=[],
	comparaison_measure='MAAD',qualityMeasure='DISAGR_SUMDIFF',nb_items=float('inf'),nb_individuals=float('inf'),threshold_comparaison=30,threshold_nb_users_1=10,threshold_nb_users_2=10,quality_threshold=0.3,
	ponderation_attribute=None,bound_type=1,pruning=True,closed=True,do_heuristic_contexts=False,do_heuristic_peers=False,timebudget=1000,results_destination='.//results.csv',attributes_to_consider=None,heatmap_for_matrix=False,algorithm='DSC+CLOSED+UB2',nb_items_entities=float('inf'),nb_items_individuals=float('inf'),symmetry=True,nb_random_walks=30,hmt_to_itemset=False,debug=False,verbose=False):
	
	import pickle
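	# NOTE: the aggregation method is hard-coded here and overrides the
	# method_aggregation_outcome argument supplied by the caller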
	method_aggregation_outcome='SYMBOLIC_MAJORITY'
	inited=time()
	
	# try:
	# 	with open ('.//TMP.tmp', 'rb') as fp:
	# 		items_metadata,users_metadata,all_users_to_items_outcomes,outcomes_considered,items_id_attribute,users_id_attribute,considered_items_sorted,considered_users_1_sorted,considered_users_2_sorted,nb_outcome_considered=pickle.load(fp)
	# except Exception as e:
	print 'computing from zero'
	items_metadata,users_metadata,all_users_to_items_outcomes,outcomes_considered,items_id_attribute,users_id_attribute,considered_items_sorted,considered_users_1_sorted,considered_users_2_sorted,nb_outcome_considered,vector_of_action,to_consider_ids_in_contexts_scope =\
	process_outcome_dataset(itemsFile,usersFile,reviewsFile,numeric_attrs=numeric_attrs,array_attrs=array_attrs,outcome_attrs=outcome_attrs,method_aggregation_outcome=method_aggregation_outcome,itemsScope=itemsScope,users_1_Scope=users_1_Scope,users_2_Scope=users_2_Scope,nb_items=nb_items,nb_individuals=nb_individuals,attributes_to_consider=attributes_to_consider,delimiter=delimiter)
	# with open('.//TMP.tmp', 'wb') as fp:
	# 	pickle.dump([items_metadata,users_metadata,all_users_to_items_outcomes,outcomes_considered,items_id_attribute,users_id_attribute,considered_items_sorted,considered_users_1_sorted,considered_users_2_sorted,nb_outcome_considered], fp)
	
	print considered_items_sorted[0]	
	
	if False:
		considered_items_sorted=[x for x in considered_items_sorted if x[items_id_attribute] not in {'0002','0012','0022','0032','0042','0052','0062','0072'}]
		print len(considered_items_sorted)
		#raw_input('...')
		desc_indiv=[{'dimensionName':'CIGARETTE','inSet':{'Je fume'}},{'dimensionName':'SEXE','inSet':{'M'}},{'dimensionName':'AGE','inInterval':[0,25]}][:1]
		groups_indiv,_=filter_pipeline_obj(considered_users_1_sorted,desc_indiv)
		print len(groups_indiv)
		groups_indiv=set(x[users_id_attribute] for x in groups_indiv)
		set_of_entities=set(x[items_id_attribute] for x in considered_items_sorted)
		all_users_to_items_outcomes={u:{x:y for x,y in v.iteritems() if x in set_of_entities} for u,v in all_users_to_items_outcomes.iteritems()}
		matrix_by_e,matrix,individuals_for_each_entitiy,nb_votes_casted_by_outcome=coincidence_matrix(all_users_to_items_outcomes,individuals_set=groups_indiv)
		
		
		full_reliability,full_sums_votes=compute_reliability(matrix,individuals_for_each_entitiy,nb_votes_casted_by_outcome,set_of_entities,distance_function=distance_numerical)
		print 'reliability',full_reliability

		

		enumerator_contexts=enumerator_complex_cbo_init_new_config(considered_items_sorted, [{'name':'odors', 'type':'themes'}],threshold=1)	
		results=[]
		for e_p,e_label,e_config in enumerator_contexts:
			context_pat=pattern_printer(e_label,['themes'])
			items_context=set(x[items_id_attribute] for x in e_config['support'])

			try:
				context_reliablitiy,_=compute_reliability(matrix,individuals_for_each_entitiy,nb_votes_casted_by_outcome,items_context,distance_function=distance_numerical,nb_votes_casted_by_outcome_sums=full_sums_votes)
				print e_p,context_reliablitiy
				raw_input('......')
			except Exception as e:
				continue
			results.append([context_pat,context_reliablitiy,items_context])
		results=sorted(results,key=lambda x : x[1],reverse=False)
		to_ret=[]
		for elem in results:
			print elem[0],elem[1]
			to_ret.append({'pattern':elem[0],'reliability':elem[1]})
			#raw_input('...')
	#raw_input('...') 
		#to_ret=[]
	if True:
		frenchdeputies,_=filter_pipeline_obj(considered_users_1_sorted,[{'dimensionName':'COUNTRY','inSet':{'Greece'}}])#[{'dimensionName':'GROUPE_ID','inSet':{'S&D'}}])#[{'dimensionName':'GROUPE_ID','inSet':{'ECR'}}]) #[{'dimensionName':'NATIONAL_PARTY','inSet':{'Parti socialiste'}}]
		frenchdeputies_metadata=sorted(frenchdeputies,key=lambda x:x[users_id_attribute]) #Parti socialiste
		for row in frenchdeputies_metadata:
			row['NATIONAL_PARTY']=unicodedata.normalize('NFD', unicode(str(row['NATIONAL_PARTY']),'iso-8859-1')).encode('ascii', 'ignore')
			row['NAME_FULL']=unicodedata.normalize('NFD', unicode(str(row['NAME_FULL']),'iso-8859-1')).encode('ascii', 'ignore')
		
		frenchdeputies=set(x[users_id_attribute] for x in frenchdeputies)

		print len(frenchdeputies)
		set_of_entities=set(x[items_id_attribute] for x in considered_items_sorted)
		#raw_input('...')
		#print considered_items_sorted[0]
		if True:
			print 'Starting filter'
			all_users_to_items_outcomes={u:{x:y for x,y in v.iteritems() if y[0]!='Abstain'} for u,v in all_users_to_items_outcomes.iteritems()}
			print 'Ending filter'
		#print 'cohesion',compute_cohesion(all_users_to_items_outcomes,set_of_entities,frenchdeputies)
		matrix_by_e,matrix,individuals_for_each_entitiy,nb_votes_casted_by_outcome=coincidence_matrix(all_users_to_items_outcomes,individuals_set=frenchdeputies)
		full_reliability,full_sums_votes=compute_reliability(matrix,individuals_for_each_entitiy,nb_votes_casted_by_outcome,set_of_entities)
		


		print 'reliability',full_reliability
		


		#compute_reliability_with_bootstraping_KILEM_GWET(matrix_by_e,matrix,individuals_for_each_entitiy,nb_votes_casted_by_outcome,set_of_entities,distance_function=distance_nominal)
		compute_reliability_with_bootstraping_KILEM_GWET_BLB(matrix_by_e,matrix,individuals_for_each_entitiy,nb_votes_casted_by_outcome,set_of_entities,distance_function=distance_nominal)
		
		# for x,y,z in  enumerator_complex_from_dataset_new_config(considered_items_sorted, [{'name':'VOTEID', 'type':'nominal'}],threshold=5):
		# 	print x
		# 	raw_input('...')
			
		enumerator_contexts=enumerator_complex_cbo_init_new_config(considered_items_sorted, [{'name':'PROCEDURE_SUBJECT', 'type':'themes'}],threshold=5)	
		results=[]
		index=0
		for e_p,e_label,e_config in enumerator_contexts:
			index+=1
			context_pat=pattern_printer(e_label,['themes'])
			items_context=set(x[items_id_attribute] for x in e_config['support'])

			#try:
			context_reliablitiy,_=compute_reliability(matrix,individuals_for_each_entitiy,nb_votes_casted_by_outcome,items_context,nb_votes_casted_by_outcome_sums=full_sums_votes)
			my_reliability,_=compute_reliability_NEW(matrix,individuals_for_each_entitiy,nb_votes_casted_by_outcome,items_context,nb_votes_casted_by_outcome_sums=full_sums_votes)
			
			cohesion,vector_full=compute_cohesion(all_users_to_items_outcomes,items_context,frenchdeputies)
			if context_reliablitiy>0.85 and False:
				from heatmap.heatmap import generateHeatMap
				from clusters.hierarchicalClustering import drawDendrogramme,applyHierarchiqueClusteringFromDataset,hierarchicalClusteringFromDataset
				print context_reliablitiy
				

				ref_matrix=similarity_dictionnary(all_users_to_items_outcomes,frenchdeputies,set_of_entities)
				rower=[x['NATIONAL_PARTY']+'_'+x['NAME_FULL']+'_'+str(x[users_id_attribute]) for x in frenchdeputies_metadata]
				head=[['']]+rower
				ref_matrix_array=[[ref_matrix[u1][u2] if u1<u2 else ref_matrix[u2][u1] if u1>u2 else float(1.)  for u2 in sorted(ref_matrix)] for u1 in sorted(ref_matrix)]
				for k in range(len(ref_matrix_array)):
					ref_matrix_array[k].insert(0,rower[k])
				ref_matrix_array.insert(0,head)
				#ref_matrix_array_after_reorganizing=generateHeatMap(sim_matrix_array,'./Figures/REF_heatmap_'+str(index)+'.png',vmin=0.,vmax=1.,showvalues_text=False,only_heatmap=True,organize=True)



				sim_matrix=similarity_dictionnary(all_users_to_items_outcomes,frenchdeputies,items_context)
				rower=[x['NATIONAL_PARTY']+'_'+x['NAME_FULL']+'_'+str(x[users_id_attribute]) for x in frenchdeputies_metadata]
				head=[['']]+rower
				dist_matrix_array=[[1.-sim_matrix[u1][u2] if u1<u2 else 1.-sim_matrix[u2][u1] if u1>u2 else float(0.)  for u2 in sorted(sim_matrix)] for u1 in sorted(sim_matrix)]
				sim_matrix_array=[[sim_matrix[u1][u2] if u1<u2 else sim_matrix[u2][u1] if u1>u2 else float(1.)  for u2 in sorted(sim_matrix)] for u1 in sorted(sim_matrix)]
				for k in range(len(dist_matrix_array)):
					dist_matrix_array[k].insert(0,rower[k])
					sim_matrix_array[k].insert(0,rower[k])
				dist_matrix_array.insert(0,head)
				sim_matrix_array.insert(0,head)
				# print similarity_dictionnary(all_users_to_items_outcomes,frenchdeputies,items_context)
				# raw_input('...')
				#clusteringResults,clusters,linkageMatrix=applyHierarchiqueClusteringFromDataset(frenchdeputies_metadata,dist_matrix_array,parameter=0.4)
				hierarchicalClusteringFromDataset(frenchdeputies_metadata,dist_matrix_array,dendrogrammeDestination='./Figures/fig_'+str(index)+'.png',parameter=0.4,label_dendrogramme='NAME_FULL')
				writeCSV(dist_matrix_array,'./Figures/matrix_'+str(index)+'.csv',delimiter='\t')
					
					

					


			# except Exception as e:
			# 	raise e
			# 	continue
				cp_matrice_pattern=generateHeatMap(sim_matrix_array,'./Figures/heatmap_'+str(index)+'.png',vmin=0.,vmax=1.,showvalues_text=False,only_heatmap=True,organize=True)
				cp_matrice_pattern,ref_matrix_array=reorganize_similarly(cp_matrice_pattern,ref_matrix_array)
				generateHeatMap(ref_matrix_array,'./Figures/REF_heatmap_'+str(index)+'.png',vmin=0.,vmax=1.,showvalues_text=False,only_heatmap=True,organize=False)
			
			#print context_pat,'context reliablitiy',context_reliablitiy
			results.append([index,context_pat,context_reliablitiy,cohesion,items_context,vector_full,my_reliability])
		#results.append(['*',full_reliability,set_of_entities])
		results=sorted(results,key=lambda x : x[2],reverse=False)
		to_ret=[]

		for elem in results:
			#print elem[0],elem[1]
			to_ret.append({'index':elem[0],'pattern':elem[1],'contextSize':len(elem[4]),'reliability':elem[2],'cohesion':elem[3],'full_vector':elem[5],'myreliability':elem[6]})
			# for c_1 in matrix:
			# 	for c_2 in matrix[c_1]:
			# 		print c_1,c_2,sum(matrix[c_1][c_2][e] for e in elem[2]&matrix[c_1][c_2].viewkeys())
			# for i in frenchdeputies:
			# 	print i, [all_users_to_items_outcomes[i][e] for e in sorted(elem[2]) if e in all_users_to_items_outcomes[i]]
			#raw_input('...')
	#gc.collect()
	return to_ret
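
# A minimal sketch of invoking Alpha_Entry_Point (not part of the original
# module); every concrete value is an illustrative assumption. The call returns
# one dict per enumerated context with its reliability and cohesion scores.
def example_alpha_entry_point():
    rows = Alpha_Entry_Point(
        'items.csv', 'users.csv', 'outcomes.csv',
        numeric_attrs=['AGE'], array_attrs=['PROCEDURE_SUBJECT'],
        outcome_attrs=['VOTE'],
        attributes_to_consider=['PROCEDURE_SUBJECT'],
        comparaison_measure='MAAD', qualityMeasure='DISAGR_SUMDIFF')
    for r in rows:
        print r['pattern'], r['reliability'], r['cohesion']
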
Example #4
def workflowStage_iteratorsOnMultipeAttributes_subgroupBitwise_subgroups_tests(
        inputs={}, configuration={}, outputs={}, workflow_stats={}):
    '''
    {
    
        'id':'stage_id',
        'type':'multiple_attributes_iterator_sgbitwise_subgroups',
        'inputs': {
            'dataset':[],
            'attributes':[
                {
                    'name' : name_attribute,
                    'type' : 'themes' | 'numeric' | 'nominal',
                    'depthmax': None | 'or a value'
                }, ...
            ]
        },
        'configuration': {
            'skip_list':[],
            'bitwise':[]
        },
        'outputs':{
            'yielded_item':'',
            'yielded_index':'' 
        }
    }
    '''

    votes_attributes = inputs['votes_attributes']
    users_attributes = inputs['users_attributes']
    position_attribute = inputs['position_attribute']
    vector_of_outcome = configuration.get('vector_of_outcome', None)
    user1_scope = inputs['user_1_scope']
    user2_scope = inputs['user_2_scope']

    ####################
    dataset = inputs['dataset']

    reviews_dataset = configuration['reviews_dataset']
    items_dataset = configuration['items_dataset']
    users_dataset = configuration['users_dataset']

    filteredDataset1 = filter_pipeline_obj(dataset, user1_scope)[0]

    filteredDataset2 = filter_pipeline_obj(dataset, user2_scope)[0]
    filteredDataset = filteredDataset1 + filteredDataset2 if (
        user1_scope != user2_scope) else filteredDataset1
    configuration_execution = inputs['XP']

    print ''
    final_stats = []

    considered = [
        'attr_items_range', 'attr_users_range', 'upperbound',
        'attr_aggregates_range', 'nb_items_range', 'nb_users_range',
        'sigma_user_range', 'sigma_agg_range', 'sigma_item_range',
        'sigma_quality_range', 'top_k_range', 'prunning_range', 'closed_range',
        'similarity_measures', 'quality_measures'
    ]
    total = 1
    for key in considered:
        total *= len(configuration_execution[key])
    index = 0
    actu_config = []
    last_config = []
    last_time_spent = 0
    time_threshold = 3600

    start = time()
    stdout.write('\rPercentage Done : ' + '%.2f' %
                 (index * 100 / float(total)) + ' %\t' + 'Time elapsed : ' +
                 '%.2f' % (time() - start) + 's')
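    # exhaustively enumerate the Cartesian product of all experiment parameters
    # (equivalent to itertools.product over the ranges listed in 'considered')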
    for closed_inst in configuration_execution['closed_range']:
        for prunning_inst in configuration_execution['prunning_range']:
            # Pruning via upper bounds is only attempted on closed patterns.
            if prunning_inst and not closed_inst: continue
            for attr_item_inst in configuration_execution['attr_items_range']:
                for attr_users_inst in configuration_execution['attr_users_range']:
                    for attr_users_agg_inst in configuration_execution['attr_aggregates_range']:
                        for nb_items_inst in configuration_execution['nb_items_range']:
                            for nb_users_inst in configuration_execution['nb_users_range']:
                                for sigma_items_inst in configuration_execution['sigma_item_range']:
                                    for sigma_users_inst in configuration_execution['sigma_user_range']:
                                        for sigma_agg_users_inst in configuration_execution['sigma_agg_range']:
                                            for sim_measure_inst in configuration_execution['similarity_measures']:
                                                for quality_measure_inst in configuration_execution['quality_measures']:
                                                    for sigma_quality_inst in configuration_execution['sigma_quality_range']:
                                                        for top_k_inst in configuration_execution['top_k_range']:
                                                            upperbound_range = configuration_execution['upperbound']
                                                            if not prunning_inst:
                                                                # Without pruning the upper-bound type has no effect,
                                                                # so a single dummy value suffices.
                                                                upperbound_range = [1]

                                                            for upperboundType in upperbound_range:

                                                                index += 1
                                                                # Candidate attributes: item (vote) attributes first, then user attributes.
                                                                input_new_attributes = [{'name': tupe[0], 'type': tupe[1]} for tupe in attr_item_inst]
                                                                input_new_attributes += [{'name': tupe[0], 'type': tupe[1]} for tupe in attr_users_inst]
                                                                conf_new = {
                                                                    'nb_dossiers_min': 1,
                                                                    'threshold_pair_comparaison': sigma_items_inst,
                                                                    'cover_threshold': configuration['cover_threshold'],  # TODO: see again how to declare conditions on frequent patterns
                                                                    'quality_threshold': sigma_quality_inst,
                                                                    'top_k': top_k_inst,
                                                                    'iwant': quality_measure_inst,
                                                                    'upperbound': upperboundType,
                                                                    'closed': closed_inst,
                                                                    'pruning': prunning_inst,
                                                                    'nb_items': nb_items_inst,
                                                                    'nb_users': nb_users_inst,
                                                                    'aggregation_attributes_user1': attr_users_agg_inst,  # e.g. ['ageGroup'] or ['NATIONAL_PARTY']
                                                                    'aggregation_attributes_user2': attr_users_agg_inst,
                                                                    'nb_aggergation_min_user1': sigma_agg_users_inst,
                                                                    'threshold_nb_users_1': sigma_users_inst,
                                                                    'nb_aggergation_min_user2': sigma_agg_users_inst,
                                                                    'threshold_nb_users_2': sigma_users_inst,
                                                                    'comparaison_measure': sim_measure_inst,
                                                                    'reviews_dataset': reviews_dataset,
                                                                    'items_dataset': items_dataset,
                                                                    'users_dataset': users_dataset,
                                                                    'vector_of_outcome': vector_of_outcome
                                                                }

                                                                actu_config = [conf_new['closed'], conf_new['pruning'], conf_new['upperbound']]
                                                                # Skip when the same (closed, pruning, upperbound) triple
                                                                # already exceeded the time budget on the previous run.
                                                                if actu_config == last_config and last_time_spent >= time_threshold:
                                                                    continue

                                                                attr_users_nb = len([x for x in input_new_attributes if x['name'] in users_attributes])
                                                                # Alternative dispatch (commented out):
                                                                # if attr_users_nb == 0:
                                                                #     iterator = dsc_c_method(filteredDataset, input_new_attributes, conf_new, user1_scope, user2_scope, votes_attributes, users_attributes, position_attribute)
                                                                # else:
                                                                #     iterator = dsc_uuc_method(filteredDataset, input_new_attributes, conf_new, user1_scope, user2_scope, votes_attributes, users_attributes, position_attribute)
                                                                iterator = DSCFORPERF(filteredDataset, input_new_attributes, conf_new, user1_scope, user2_scope, votes_attributes, users_attributes, position_attribute)

                                                                #next(iterator)

                                                                dossiers_voted = []  # guard: stays empty if the iterator yields nothing
                                                                for pattern, description, pairwiseStatistics, quality, upper_bound, dossiers_voted in iterator:
                                                                    outputs['yielded_item'] = pattern
                                                                    outputs['yielded_description'] = description
                                                                    outputs['yielded_index'] = index
                                                                    outputs['quality'] = quality
                                                                    outputs['upper_bound'] = (index == 1)
                                                                    outputs['dossiers_voted'] = dossiers_voted
                                                                    #outputs['yielded_bitwise']=bitwise
                                                                    outputs['pairwiseStatistics'] = pairwiseStatistics
                                                                    #yield outputs
                                                                    final_stats += dossiers_voted
                                                                    #print dossiers_voted
                                                                    yield dossiers_voted
                                                                last_config = actu_config[:]
                                                                # The last yielded batch carries the '#timespent' measurement used
                                                                # to skip equivalent configurations that already timed out.
                                                                if dossiers_voted:
                                                                    last_time_spent = dossiers_voted[0]['#timespent']
                                                                stdout.write('\rPercentage Done : ' + '%.2f' % (index * 100 / float(total)) + ' %\t' + 'Time elapsed : ' + '%.2f' % (time() - start) + 's')

    outputs['dossiers_voted'] = final_stats
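
# ----------------------------------------------------------------------
# A minimal standalone sketch (not part of the original pipeline) of the same
# exhaustive grid search written with itertools.product instead of thirteen
# nested loops; `ranges`, `keys`, and `run_one_configuration` are hypothetical
# stand-ins for configuration_execution, the `considered` list, and the miner
# invocation above.
from itertools import product

def iter_configurations(ranges, keys):
    # Yield one {key: value} dict per point of the Cartesian product of the
    # given ranges, in the same order the nested loops would visit them.
    for values in product(*[ranges[k] for k in keys]):
        yield dict(zip(keys, values))

# Usage sketch; invalid combinations (e.g. pruning without closed patterns)
# would still have to be skipped by the caller, exactly as in the loops above:
#   for conf in iter_configurations({'closed_range': [True, False],
#                                    'top_k_range': [5, 10]},
#                                   ['closed_range', 'top_k_range']):
#       run_one_configuration(conf)  # hypothetical driver
# ----------------------------------------------------------------------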
Example #5
def extractStatistics_fromdataset_vectors(dataset,votesAttributes,usersAttributes,position_attribute,user1_scope=[],user2_scope=[]):
    vote_identifier=str(votesAttributes[0])
    user_identifier=str(usersAttributes[0])
    user_1_identifier='USER1'
    user_2_identifier='USER2'

    mepsStats={}
    mepsMeta={}
    listsOfVotes={}

    # Textual outcome -> position code (defined but not used in this function).
    mapping_user_vote={'for':'Y','against':'N','abstain':'A'}
    users_map_details={}
    # Group ballots by vote id and keep one attribute record per distinct user.
    for obj in dataset :
        vote_id=str(obj[vote_identifier])
        if vote_id not in listsOfVotes :
            listsOfVotes[vote_id]=[]
        listsOfVotes[vote_id].append(obj)

        d_user_id=obj[user_identifier]
        if d_user_id not in users_map_details :
            users_map_details[d_user_id]={key:obj[key] for key in usersAttributes}
     
    users_map_details_array=users_map_details.values()
    
    users_map_details_array_filtered_user1=filter_pipeline_obj(users_map_details_array, user1_scope)[0]
    users_map_details_array_filtered_user2=filter_pipeline_obj(users_map_details_array, user2_scope)[0]
    
    users1_ids=set([x[user_identifier] for x in users_map_details_array_filtered_user1])
    users2_ids=set([x[user_identifier] for x in users_map_details_array_filtered_user2])
    
    nb_votes_all=len(listsOfVotes)
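    # Pairwise pass: for every ballot, pair each user1-scope voter with each
    # user2-scope voter and store their two positions under the vote id.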
    iterValues=listsOfVotes.iteritems()
    for vote_id,actualVote in iterValues:
        for mep1_object in actualVote :
            if mep1_object[user_identifier] in users1_ids :
                mep1_index=str(mep1_object[user_identifier])
                try:
                    pairwiseStatsRowOfMep1=mepsStats[mep1_index]
                except KeyError:
                    mepsStats[mep1_index],mepsMeta[mep1_index]={},{attribute_user:mep1_object[attribute_user] for attribute_user in usersAttributes}
                    pairwiseStatsRowOfMep1=mepsStats[mep1_index]

                for mep2_object in actualVote:
                    if mep2_object[user_identifier] in users2_ids :
                        mep2_index=str(mep2_object[user_identifier])
                        try:
                            pairOfMeps=pairwiseStatsRowOfMep1[mep2_index]
                        except KeyError :
                            # FLAGCOMPUTED: once True, the stats matrix holds vector-similarity
                            # results rather than the vectors themselves.
                            pairwiseStatsRowOfMep1[mep2_index]={user_1_identifier : mep1_index,user_2_identifier : mep2_index,'NB_VOTES':0,'**':{},'ALL_VOTES':nb_votes_all,'FLAGCOMPUTED':False}
                            mepsMeta[mep2_index]={attribute_user:mep2_object[attribute_user] for attribute_user in usersAttributes}
                            pairOfMeps=pairwiseStatsRowOfMep1[mep2_index]

                        # Record both positions on this ballot and bump the co-vote count.
                        vector_mepwise12=(mep1_object[position_attribute],mep2_object[position_attribute])
                        pairOfMeps['**'][vote_id]=vector_mepwise12
                        pairOfMeps['NB_VOTES']=pairOfMeps['NB_VOTES']+1
    
    ################ ADD AN EMPTY RECORD FOR PAIRS WHO NEVER VOTED TOGETHER ##############
    for mep1_index in users1_ids:
        # Guard: create the row if this user never co-voted with anyone
        # (the unconditional access below would otherwise raise a KeyError).
        if mep1_index not in mepsStats:
            mepsStats[mep1_index]={}
        for mep2_index in users2_ids:
            if mep2_index not in mepsStats[mep1_index]:
                mepsStats[mep1_index][mep2_index]={user_1_identifier : mep1_index,user_2_identifier : mep2_index,'NB_VOTES':0,'**':{},'ALL_VOTES':nb_votes_all,'FLAGCOMPUTED':False}
            mepsStats[mep1_index][mep2_index]['KEYS']=set(mepsStats[mep1_index][mep2_index]['**'])
    #####################################################################################
    
    return mepsStats,mepsMeta
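
# ----------------------------------------------------------------------
# A toy illustration (hypothetical data, assuming filter_pipeline_obj treats an
# empty scope as "keep everything") of the structure this function returns:
# the '**' map of each (user1, user2) pair holds one
# (position_of_user1, position_of_user2) tuple per ballot both users cast.
#
#   toy = [
#       {'VOTEID': 'v1', 'EP_ID': 'a', 'POS': 'Y'},
#       {'VOTEID': 'v1', 'EP_ID': 'b', 'POS': 'N'},
#       {'VOTEID': 'v2', 'EP_ID': 'a', 'POS': 'Y'},
#       {'VOTEID': 'v2', 'EP_ID': 'b', 'POS': 'Y'},
#   ]
#   stats, meta = extractStatistics_fromdataset_vectors(toy, ['VOTEID'], ['EP_ID'], 'POS')
#   # stats['a']['b'] == {'USER1': 'a', 'USER2': 'b', 'NB_VOTES': 2,
#   #                     '**': {'v1': ('Y', 'N'), 'v2': ('Y', 'Y')},
#   #                     'ALL_VOTES': 2, 'FLAGCOMPUTED': False,
#   #                     'KEYS': set(['v1', 'v2'])}
# ----------------------------------------------------------------------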