def generic_enumerators_top_k(dataset, attributes, configuration, user1_scope, user2_scope,
                              votes_attributes, users_attributes, position_attribute, closed=True):
    # Top-k subgroup enumeration driven by a disagreement-based quality measure.
    spent = 0
    visited_patterns_more_detailed = set()
    nb_quality_measured = 0
    interesting_patterns = []
    len_attributes = len(attributes)
    range_len_attributes = range(len_attributes)
    top_k = configuration.get('top_k', float('inf'))
    if top_k is None:
        top_k = float('inf')
    quality_threshold = configuration.get('quality_threshold', 0)
    if quality_threshold is None:
        quality_threshold = 0
    quality_threshold = float(quality_threshold)
    pruning = configuration.get('pruning', False)
    iwant = configuration['iwant']
    nb_dossiers_min = configuration['nb_dossiers_min']
    cover_threshold = configuration['cover_threshold']
    threshold_pair_comparaison = float(configuration['threshold_pair_comparaison'])
    comparaison_measure = configuration['comparaison_measure']
    subpattern_votes_filter = [attr['name'] in votes_attributes for attr in attributes]
    vote_id_attributes = votes_attributes[0]
    users_id_attributes = users_attributes[0]
    patterns_visited_valid = {}

    ############################### GET DISTINCT VALUES ####################################
    user2_scope_values = {x[users_id_attributes] for x in filter_pipeline_obj(dataset, user2_scope)[0]}
    votes_map_details, votes_map_meps, users_map_details, users_map_votes, arr_distinct_values = \
        get_distinct_values(dataset, attributes, votes_attributes, users_attributes,
                            position_attribute, user2_scope_values)
    votes_map_details_array = votes_map_details.values()
    users_map_details_array = users_map_details.values()
    #########################################################################################
    users_map_details_array_filtered_user1 = filter_pipeline_obj(users_map_details_array, user1_scope)[0]
    users_map_details_array_filtered_user2 = filter_pipeline_obj(users_map_details_array, user2_scope)[0]
    users_ids = {obj[users_id_attributes] for obj in users_map_details_array}
    users1_ids = {x[users_id_attributes] for x in users_map_details_array_filtered_user1}
    users2_ids = {x[users_id_attributes] for x in users_map_details_array_filtered_user2}
    nb_users_voted = len(users1_ids) + len(users2_ids)

    ############################### STATS FOR THE WHOLE DATASET ('*') ######################
    original_mepsStatistics, original_mepsStatsNumbers, original_mepsMeta = \
        extractStatistics_fromdataset_vectors(dataset, votes_attributes, users_attributes,
                                              position_attribute, user1_scope, user2_scope)
    original_mepwise_similarities = {}
    for user1 in original_mepsStatsNumbers:
        original_mepwise_similarities[user1] = {}
        for user2 in original_mepsStatsNumbers[user1]:
            original_mepwise_similarities[user1][user2] = get_sim_vectors(
                original_mepsStatsNumbers, user1, user2, comparaison_measure)
    #########################################################################################
    (arr_data, arr_types, arr_depthmax, arr_refinement_indexes, arr_labels, subgroup_pipeline,
     filter_operations, subgroup_pipeline_for_votes, subgroup_pipeline_for_meps) = \
        get_arrdata_from_dataset_values(arr_distinct_values, attributes, votes_attributes,
                                        users_attributes, position_attribute)
    configuration['stats'] = original_mepsStatistics
    configuration['skip_list'] = []
    configuration['closing'] = [None] * len(attributes)
    configuration['closing_refin'] = [([''], None)] * len(attributes)
    enumerator = generic_enumerator_multiattributes_dfs(arr_data, arr_types, arr_refinement_indexes,
                                                        arr_depthmax, configuration,
                                                        bitwise=None, stats=original_mepsStatistics)
    index, index_has_been_visited, index_frequent, index_valid, index_non_valid, index_good = 0, 0, 0, 0, 0, 0

    ############################### FAST PROCESSING OF 'themes' ATTRIBUTES #################
    # Replace each HMT tag list by the set of all its prefixes (e.g. '1.2.3' also
    # yields '1' and '1.2'); the original tag ids are kept under '<name>0'.
    for attr in attributes:
        if attr['type'] in ['themes', 'themes2']:
            dimensionName = attr['name']
            for obj in votes_map_details_array:
                s2_arr = []
                obj_idlabels = set()
                s2_arr_extend = s2_arr.extend
                for val in obj[dimensionName]:
                    v_theme_id = val[:val.index(' ')]
                    obj_idlabels |= {v_theme_id}
                    v = v_theme_id.split('.')
                    s2_arr_extend(['.'.join(v[0:x + 1]) for x in range(len(v))])
                obj[dimensionName + '0'] = obj_idlabels
                if obj[dimensionName] == []:
                    obj[dimensionName + '0'] = [' ']
                obj[dimensionName] = set(s2_arr)
    #########################################################################################

    for p, bitwise_p, stats in enumerator:
        p_vote = []; p_vote_append = p_vote.append
        p_mep = []; p_mep_append = p_mep.append
        tuple_p = tuple(); attr_ind = 0
        for p_attr, is_vote_attr in zip(p, subpattern_votes_filter):
            if is_vote_attr:
                p_vote_append(p_attr)
            else:
                p_mep_append(p_attr)
            tuple_p += (tuple(p_attr),)
            subgroup_pipeline[attr_ind][filter_operations[attr_ind]] = p_attr
            attr_ind += 1
        index += 1

        ############ COMPUTE SUBGROUP AND CHECK WHETHER THE PATTERN IS FREQUENT #############
        filteredDataset_votes, bitwise = filter_pipeline_obj(votes_map_details_array,
                                                             subgroup_pipeline_for_votes, bitwise_p)
        nb_votes = len(filteredDataset_votes)
        configuration['bitwise'] = bitwise
        users_map_details_array_filtered_user2_pattern, unused_bitwise = filter_pipeline_obj(
            users_map_details_array_filtered_user2, subgroup_pipeline_for_meps)
        users2_ids_pattern = {x[users_id_attributes]
                              for x in users_map_details_array_filtered_user2_pattern}
        v_ids = set()
        dossiers_ids = set()
        if nb_votes <= threshold_pair_comparaison:
            configuration['skip_list'] = [p]
        if nb_votes < threshold_pair_comparaison:
            continue
        for obj in filteredDataset_votes:
            v_ids |= {obj[vote_id_attributes]}
            dossiers_ids |= {obj['DOSSIERID']}
        nb_dossiers = len(dossiers_ids)
        if nb_dossiers <= nb_dossiers_min:
            configuration['skip_list'] = [p]
        if nb_dossiers < nb_dossiers_min:
            continue
        p_more_detailed = p

        ################################ CLOSURE ############################################
        start = time()
        if closed:
            p_more_detailed = []; p_more_detailed_append = p_more_detailed.append
            arr_data_detailed = []; arr_data_detailed_append = arr_data_detailed.append
            arr_refin_detailed = []; arr_refin_detailed_append = arr_refin_detailed.append
            tuple_p_more_detailed = tuple()
            for i in range_len_attributes:
                attr = attributes[i]
                attr_type = attr['type']
                attr_name = attr['name'] if attr_type not in ['themes', 'themes2'] else attr['name'] + '0'
                subp = p[i]
                values = [x[attr_name] for x in filteredDataset_votes]
                min_p, min_arr, min_rein = POSSIBLE_ENUMERATOR_MINIMAL_DESCRIPTION[attr_type](values)
                min_p = subp if min_p is None else min_p
                p_more_detailed_append(min_p)
                toContinueFrom = closure_continue_from(subp, min_p)
                arr_data_detailed_append(min_arr)
                arr_refin_detailed_append((toContinueFrom, min_rein))
                tuple_p_more_detailed += (tuple(p_more_detailed[-1]),)
            configuration['closing'] = arr_data_detailed
            configuration['closing_refin'] = arr_refin_detailed
            if not all(POSSIBLE_ENUMERATOR_RESPECT_ORDER[attributes[i]['type']](p[i], p_more_detailed[i])
                       for i in range_len_attributes):
                spent += time() - start
                configuration['skip_list'] = [p]
                continue
        spent += time() - start
        #####################################################################################

        filteredDataset_meps_votes = {}
        users_ids_set = set()
        max_votes_pairwise = 0
        for key in users_ids:
            votes_user = users_map_votes[key] & v_ids
            len_votes_user = len(votes_user)
            if len_votes_user >= threshold_pair_comparaison:
                filteredDataset_meps_votes[key] = votes_user
                users_ids_set |= {key}
                max_votes_pairwise = max(max_votes_pairwise, len_votes_user)
        if max_votes_pairwise <= threshold_pair_comparaison:
            configuration['skip_list'] = [p]
        if max_votes_pairwise < threshold_pair_comparaison:
            continue
        users1_ids_set = users1_ids & users_ids_set
        users2_ids_set = users2_ids & users_ids_set & users2_ids_pattern
        # The user-2 group is restricted by the pattern itself.
        nb_users_voted = len(users1_ids) + len(users2_ids_pattern)
        index_frequent += 1

        ################################ COVER COMPUTING ####################################
        if closed:
            if tuple_p_more_detailed in visited_patterns_more_detailed:
                spent += time() - start
                index_has_been_visited += 1
                continue
            visited_patterns_more_detailed |= {tuple_p_more_detailed}
        if not closed or cover_threshold < 1.:
            max_cover, b_max_cover, pattern_max, bitarray_bitwise, bitarray_bitwise_count = \
                cover_max_computation_new(p, p_vote, p_mep, bitwise, patterns_visited_valid,
                                          cover_threshold)
            if max_cover >= cover_threshold:
                index_has_been_visited += 1
                if pruning and b_max_cover < quality_threshold:
                    configuration['skip_list'] = [p]
                continue
        #####################################################################################
        returned_mepsStatistics, returned_mepsStatsNumbers, returned_mepsMeta = \
            extractStatistics_fromdataset_new_update_not_square_vectors(
                stats, original_mepsMeta, votes_attributes, users_attributes, position_attribute,
                v_ids, users_ids_set, users1_ids_set, users2_ids_set)
        configuration['stats'] = returned_mepsStatistics
        issquare = (users1_ids_set == users2_ids_set)

        ################################ QUALITY AND UPPER BOUND ############################
        original_model, pattern_model = compute_models(original_mepwise_similarities,
                                                       returned_mepsStatsNumbers,
                                                       comparaison_measure, issquare)
        quality, borne_max_quality = compute_quality(original_model, pattern_model,
                                                     threshold_pair_comparaison,
                                                     nb_users_voted, iwant)
        nb_quality_measured += 1
        index_non_valid += 1
        if pruning and borne_max_quality < quality_threshold:
            configuration['skip_list'] = [p]
            continue
        index_valid += 1
        label = [POSSIBLE_ENUMERATOR_DESCRIPTION_YIELDER[arr_types[k]](p_more_detailed[k],
                                                                       arr_labels[k])
                 for k in range(len(p_more_detailed))]
        print p_more_detailed, '\t', quality, '\t', borne_max_quality, '\t', nb_votes
        if quality >= quality_threshold:
            if not closed or cover_threshold < 1.:
                patterns_visited_valid[tuple_p] = {
                    'pattern': p, 'dossiers': dossiers_ids, 'bitwise': bitarray_bitwise,
                    'bitwise_count': bitarray_bitwise_count, 'upper_bound': borne_max_quality,
                    'pattern_vote': p_vote, 'pattern_user': p_mep}
            dataset_stats = datasetStatistics(returned_mepsStatsNumbers, returned_mepsMeta,
                                              votes_attributes, users_attributes, position_attribute)
            index_good += 1
            # For each distinct procedure title: its share (%) and count of the subgroup votes.
            dossiers_voted = sorted(
                {(v['PROCEDURE_TITLE'],
                  float('%.2f' % ((sum(1 for o in filteredDataset_votes
                                       if o['PROCEDURE_TITLE'] == v['PROCEDURE_TITLE'])
                                   / float(len(v_ids))) * 100)),
                  sum(1 for o in filteredDataset_votes
                      if o['PROCEDURE_TITLE'] == v['PROCEDURE_TITLE']))
                 for v in filteredDataset_votes},
                key=lambda t: t[1], reverse=True)
            interesting_patterns.append([p_more_detailed, label, dataset_stats, quality,
                                         borne_max_quality, dossiers_voted])
            if len(interesting_patterns) > top_k:
                interesting_patterns = sorted(interesting_patterns, key=lambda t: t[3],
                                              reverse=True)[:top_k]
                quality_threshold = interesting_patterns[-1][3]

    print index, '\t', index_has_been_visited, '\t', index_frequent, '\t', index_non_valid, '\t', index_valid, '\t', index_good
    print 'TIMESPENT = ', spent
    for p, label, dataset_stats, quality, borne_max_quality, dossiers_voted in \
            sorted(interesting_patterns, key=lambda t: t[3], reverse=True):
        yield p, label, dataset_stats, quality, borne_max_quality, dossiers_voted
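# Hedged usage sketch (illustration only, not part of the original pipeline).
# The configuration keys are exactly the ones read by generic_enumerators_top_k
# above; the dataset, scopes, attribute names and identifier columns are
# hypothetical placeholders for a voting-style schema.
def _demo_generic_enumerators_top_k(dataset):
    config = {
        'top_k': 10,                        # keep only the 10 best patterns
        'quality_threshold': 0.3,           # minimal quality for a pattern to be reported
        'pruning': True,                    # prune with the optimistic estimate (upper bound)
        'iwant': 'DISAGR_SUMDIFF',          # quality measure identifier (assumed value)
        'nb_dossiers_min': 1,               # minimal number of distinct dossiers in a subgroup
        'cover_threshold': 0.9,             # maximal overlap with an already reported pattern
        'threshold_pair_comparaison': 30,   # minimal number of co-votes per pair of users
        'comparaison_measure': 'MAAD',      # pairwise similarity measure (assumed value)
    }
    attributes = [{'name': 'PROCEDURE_SUBJECT', 'type': 'themes'},   # hypothetical vote attribute
                  {'name': 'COUNTRY', 'type': 'simple'}]             # hypothetical user attribute
    for p, label, stats, quality, upper_bound, dossiers in generic_enumerators_top_k(
            dataset, attributes, config, [], [],
            ['VOTEID'], ['USERID'], 'VOTE_POSITION'):                # hypothetical schema
        print label, quality, upper_bound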
def process_outcome_dataset(itemsfile, usersfile, outcomesfile, numeric_attrs=[], array_attrs=[],
                            outcome_attrs=None, method_aggregation_outcome='VECTOR_VALUES',
                            itemsScope=[], contexts_scope=[], users_1_Scope=[], users_2_Scope=[],
                            nb_items=float('inf'), nb_individuals=float('inf'),
                            attributes_to_consider=None, nb_items_entities=float('inf'),
                            nb_items_individuals=float('inf'), hmt_to_itemset=False, delimiter='\t'):
    FULL_OUTCOME_CONSIDERED = False
    ITEMS_METADATA_NEEDED = False
    USE_CACHE = False
    SIZE_ESTIMATION = False
    SMALLER_DESCRIPTION_SPACE = True
    VERBOSE = False
    CLARIFY = True
    to_consider_ids_in_contexts_scope = None
    nb_outcome_considered = 0
    if 'CACHE' in dir(process_outcome_dataset):
        # NOTE: the cached path does not set vector_of_action, which is only
        # computed on the cold path below.
        (items, items_header, users, users_header, outcomes, outcomes_header, items_id, users_id,
         outcome_attrs, position_attr, outcomes_processed, considered_items, users1, users2,
         considered_items_ids, considered_users_1_ids, considered_users_2_ids,
         considered_users_ids) = process_outcome_dataset.CACHE
    else:
        items, items_header = readCSVwithHeader(itemsfile, numberHeader=numeric_attrs,
                                                arrayHeader=array_attrs, selectedHeader=None,
                                                delimiter=delimiter)
        users, users_header = readCSVwithHeader(usersfile, numberHeader=numeric_attrs,
                                                arrayHeader=array_attrs, selectedHeader=None,
                                                delimiter=delimiter)
        outcomes, outcomes_header = readCSVwithHeader(outcomesfile, numberHeader=numeric_attrs,
                                                      arrayHeader=array_attrs, selectedHeader=None,
                                                      delimiter=delimiter)
        items_id = items_header[0]
        users_id = users_header[0]
        outcome_attrs = outcome_attrs if outcome_attrs is not None else [outcomes_header[2]]
        position_attr = outcome_attrs[0]
        outcomes_processed, vector_of_action = outcome_representation_in_reviews(
            outcomes, position_attr, outcome_attrs, method_aggregation_outcome)
        considered_items = filter_pipeline_obj(items, itemsScope)[0]
        considered_items_in_contexts_scope = filter_pipeline_obj(items, contexts_scope)[0]
        users1 = filter_pipeline_obj(users, users_1_Scope)[0]
        users2 = filter_pipeline_obj(users, users_2_Scope)[0]
        get_items_ids = partial(map, itemgetter(items_id))
        get_users_ids = partial(map, itemgetter(users_id))
        considered_items_ids = set(get_items_ids(considered_items))
        to_consider_ids_in_contexts_scope = set(get_items_ids(considered_items_in_contexts_scope))
        considered_users_1_ids = set(get_users_ids(users1))
        considered_users_2_ids = set(get_users_ids(users2))
        considered_users_ids = considered_users_1_ids | considered_users_2_ids
        if USE_CACHE:
            process_outcome_dataset.CACHE = [
                items, items_header, users, users_header, outcomes, outcomes_header, items_id,
                users_id, outcome_attrs, position_attr, outcomes_processed, considered_items[:],
                users1[:], users2[:], set(considered_items_ids), set(considered_users_1_ids),
                set(considered_users_2_ids), set(considered_users_ids)
            ]
    if nb_items < float('inf'):
        nb_items = int(nb_items)
        considered_items_ids = set(sorted(considered_items_ids)[:nb_items])
        considered_items = [x for x in considered_items if x[items_id] in considered_items_ids]
    if nb_individuals < float('inf'):
        nb_individuals = int(nb_individuals)
        considered_users_ids = set(sorted(considered_users_ids)[:nb_individuals])
        users1 = [x for x in users1 if x[users_id] in considered_users_ids]
        users2 = [x for x in users2 if x[users_id] in considered_users_ids]
    all_users_to_items_outcomes = {}
    outcomes_considered = []
    outcomes_considered_append = outcomes_considered.append
    items_metadata = {row[items_id]: row for row in considered_items
                      if row[items_id] in considered_items_ids} if ITEMS_METADATA_NEEDED else {}
    users_metadata = {row[users_id]: row for row in users
                      if row[users_id] in considered_users_ids}
    for row in outcomes_processed:
        v_id_rev = row[items_id]
        u_id_rev = row[users_id]
        if v_id_rev in considered_items_ids and u_id_rev in considered_users_ids:
            pos_rev = row[position_attr]
            if u_id_rev not in all_users_to_items_outcomes:
                all_users_to_items_outcomes[u_id_rev] = {}
            all_users_to_items_outcomes[u_id_rev][v_id_rev] = pos_rev
            if FULL_OUTCOME_CONSIDERED:
                outcomes_considered_append({items_id: v_id_rev, users_id: u_id_rev,
                                            position_attr: pos_rev})
            nb_outcome_considered += 1
    considered_users_1_sorted = sorted(users1, key=itemgetter(users_id))
    considered_users_2_sorted = sorted(users2, key=itemgetter(users_id))
    considered_items_sorted = sorted(considered_items, key=itemgetter(items_id))
    if SIZE_ESTIMATION:
        from pympler.asizeof import asizeof
        print asizeof(all_users_to_items_outcomes)
        print asizeof(considered_items_sorted)
        print asizeof(considered_users_1_sorted)
        print asizeof(considered_users_2_sorted)
    gc.collect()
    if SMALLER_DESCRIPTION_SPACE:
        NB_SELECTED_ITEMSET_ENTITIES = nb_items_entities
        NB_SELECTED_ITEMSET_INDIVIDUALS = nb_items_individuals
        from enumerator.enumerator_attribute_complex import init_attributes_complex, create_index_complex
        from enumerator.enumerator_attribute_themes2 import tree_leafs, tree_remove

        ##################### ENTITIES - DESCRIPTION SPACE LIMITING #####################
        concerned_attributes_entities_numeric = sorted(
            set(attributes_to_consider) & set(numeric_attrs) & set(items_header))
        concerned_attributes_entities_hmt = sorted(
            set(attributes_to_consider) & set(array_attrs) & set(items_header))
        concerned_attributes_entities_categorical = sorted(
            set(attributes_to_consider) & set(items_header) -
            (set(concerned_attributes_entities_hmt) | set(concerned_attributes_entities_numeric)))
        attributes_plain = (
            [{'name': a, 'type': 'themes'} for a in concerned_attributes_entities_hmt] +
            [{'name': a, 'type': 'numeric'} for a in concerned_attributes_entities_numeric] +
            [{'name': a, 'type': 'simple'} for a in concerned_attributes_entities_categorical])
        attributes = [dict(a) for a in attributes_plain]  # pristine copy kept in attributes_plain
        attributes = init_attributes_complex(considered_items_sorted, attributes)
        index_all = create_index_complex(considered_items_sorted, attributes)
        nb_itemsets_all = 0
        for attr in attributes:
            if attr['type'] in {'numeric'}:
                attr['nb_items'] = 2 * len(attr['domain'])
            else:
                attr['nb_items'] = len(attr['domain'])
            nb_itemsets_all += attr['nb_items']
        if NB_SELECTED_ITEMSET_ENTITIES <= 1:
            # A value in (0, 1] is interpreted as a ratio of the full space.
            NB_SELECTED_ITEMSET_ENTITIES = int(nb_itemsets_all * NB_SELECTED_ITEMSET_ENTITIES)
            print "NB_SELECTED_ITEMSET_ENTITIES AFTER RATIO = ", NB_SELECTED_ITEMSET_ENTITIES
        nb_itemsets_all_context = NB_SELECTED_ITEMSET_ENTITIES
        nb_itemsets_to_remove = max(0, nb_itemsets_all - NB_SELECTED_ITEMSET_ENTITIES)
        if nb_itemsets_to_remove > 0:
            if VERBOSE:
                print 'Entities Search Space Modification : ', NB_SELECTED_ITEMSET_ENTITIES
            factor = NB_SELECTED_ITEMSET_ENTITIES / float(nb_itemsets_all)
            for attr in attributes:
                attr['nb_items_new'] = int(round(factor * attr['nb_items']))
                attr['nb_items_to_remove'] = attr['nb_items'] - attr['nb_items_new']
                attr_name = attr['name']
                attr_type = attr['type']
                attr_domain = attr['domain']
                attr_labelmap = attr['labelmap']
                if attr_type == 'themes':
                    tags_removed = set(tree_remove(attr_domain, attr['nb_items_to_remove']))
                    tags_keeped = set(attr_domain) - tags_removed
                    for i, o in enumerate(index_all):
                        o[attr_name] = o[attr_name] & tags_keeped
                        considered_items_sorted[i][attr_name] = [
                            attr_labelmap[t] if t != '' else t for t in sorted(o[attr_name])]
                elif attr_type == 'numeric':
                    values_to_remove = int(attr['nb_items_to_remove'] / 2)
                    values_to_keep = len(attr['domain']) - values_to_remove
                    if values_to_keep == 0:
                        values_to_keep = 1
                    new_attr_domain = [
                        attr_domain[int(round((k / float(values_to_keep)) * len(attr_domain)))]
                        for k in range(values_to_keep)]
                    mapto = {x: new_attr_domain[bisect_right(new_attr_domain, x) - 1]
                             for x in attr_domain}
                    for i, o in enumerate(index_all):
                        o[attr_name] = mapto[o[attr_name]]
                        considered_items_sorted[i][attr_name] = o[attr_name]
                elif attr_type == 'simple':
                    if attr['nb_items_to_remove'] >= len(attr['domain']):
                        attr['nb_items_to_remove'] = -1
                    groupOfValues = ('From ' + attr['domain'][0] + ' To ' +
                                     attr['domain'][attr['nb_items_to_remove']])
                    new_attr_domain = [groupOfValues] + attr['domain'][attr['nb_items_to_remove'] + 1:]
                    mapto = {x: x if x in new_attr_domain else groupOfValues for x in attr_domain}
                    for i, o in enumerate(index_all):
                        o[attr_name] = mapto[o[attr_name]]
                        considered_items_sorted[i][attr_name] = o[attr_name]
        else:
            if VERBOSE:
                print 'no Entities Search Space Modification'
        #################################################################################

        ##################### INDIVIDUALS - DESCRIPTION SPACE LIMITING ##################
        concerned_attributes_individuals_numeric = sorted(
            set(attributes_to_consider) & set(numeric_attrs) & set(users_header))
        concerned_attributes_individuals_hmt = sorted(
            set(attributes_to_consider) & set(array_attrs) & set(users_header))
        concerned_attributes_individuals_categorical = sorted(
            set(attributes_to_consider) & set(users_header) -
            (set(concerned_attributes_individuals_hmt) | set(concerned_attributes_individuals_numeric)))
        attributes_plain = (
            [{'name': a, 'type': 'themes'} for a in concerned_attributes_individuals_hmt] +
            [{'name': a, 'type': 'numeric'} for a in concerned_attributes_individuals_numeric] +
            [{'name': a, 'type': 'simple'} for a in concerned_attributes_individuals_categorical])
        attributes = [dict(a) for a in attributes_plain]
        considered_users_sorted = users_metadata.values()
        attributes = init_attributes_complex(considered_users_sorted, attributes)
        index_all = create_index_complex(considered_users_sorted, attributes)
        nb_itemsets_all = 0
        for attr in attributes:
            if attr['type'] in {'numeric'}:
                attr['nb_items'] = 2 * len(attr['domain'])
            else:
                attr['nb_items'] = len(attr['domain'])
            nb_itemsets_all += attr['nb_items']
        if NB_SELECTED_ITEMSET_INDIVIDUALS <= 1:
            NB_SELECTED_ITEMSET_INDIVIDUALS = int(nb_itemsets_all * NB_SELECTED_ITEMSET_INDIVIDUALS)
            print "NB_SELECTED_ITEMSET_INDIVIDUALS AFTER RATIO = ", NB_SELECTED_ITEMSET_INDIVIDUALS
        nb_itemsets_all_individuals = nb_itemsets_all
        nb_itemsets_to_remove = max(0, nb_itemsets_all - NB_SELECTED_ITEMSET_INDIVIDUALS)
        if nb_itemsets_to_remove > 0:
            if VERBOSE:
                print 'Individuals Search Space Modification : ', NB_SELECTED_ITEMSET_INDIVIDUALS
            factor = NB_SELECTED_ITEMSET_INDIVIDUALS / float(nb_itemsets_all)
            for attr in attributes:
                attr['nb_items_new'] = int(round(factor * attr['nb_items']))
                attr['nb_items_to_remove'] = attr['nb_items'] - attr['nb_items_new']
                attr_name = attr['name']
                attr_type = attr['type']
                attr_domain = attr['domain']
                attr_labelmap = attr['labelmap']
                if attr_type == 'themes':
                    tags_removed = set(tree_remove(attr_domain, attr['nb_items_to_remove']))
                    tags_keeped = set(attr_domain) - tags_removed
                    for i, o in enumerate(index_all):
                        o[attr_name] = o[attr_name] & tags_keeped
                        considered_users_sorted[i][attr_name] = [
                            attr_labelmap[t] if t != '' else t for t in sorted(o[attr_name])]
                elif attr_type == 'numeric':
                    values_to_remove = int(attr['nb_items_to_remove'] / 2)
                    values_to_keep = len(attr['domain']) - values_to_remove
                    if values_to_keep == 0:
                        values_to_keep = 1
                    new_attr_domain = [
                        attr_domain[int(round((k / float(values_to_keep)) * len(attr_domain)))]
                        for k in range(values_to_keep)]
                    mapto = {x: new_attr_domain[bisect_right(new_attr_domain, x) - 1]
                             for x in attr_domain}
                    for i, o in enumerate(index_all):
                        o[attr_name] = mapto[o[attr_name]]
                        considered_users_sorted[i][attr_name] = o[attr_name]
                elif attr_type == 'simple':
                    if attr['nb_items_to_remove'] >= len(attr['domain']):
                        attr['nb_items_to_remove'] = -1
                    groupOfValues = ('From ' + attr['domain'][0] + ' To ' +
                                     attr['domain'][attr['nb_items_to_remove']])
                    new_attr_domain = [groupOfValues] + attr['domain'][attr['nb_items_to_remove'] + 1:]
                    mapto = {x: x if x in new_attr_domain else groupOfValues for x in attr_domain}
                    for i, o in enumerate(index_all):
                        o[attr_name] = mapto[o[attr_name]]
                        considered_users_sorted[i][attr_name] = o[attr_name]
        else:
            if VERBOSE:
                print 'no Individuals Search Space Modification'
        users1 = [x for x in considered_users_sorted if x[users_id] in considered_users_1_ids]
        users2 = [x for x in considered_users_sorted if x[users_id] in considered_users_2_ids]
        considered_users_1_sorted = sorted(users1, key=itemgetter(users_id))
        considered_users_2_sorted = sorted(users2, key=itemgetter(users_id))
        #################################################################################
        items_metadata = {row[items_id]: row for row in considered_items_sorted
                          if row[items_id] in considered_items_ids} if ITEMS_METADATA_NEEDED else {}
        users_metadata = {row[users_id]: row for row in considered_users_sorted
                          if row[users_id] in considered_users_ids}

    TOITEMSET = hmt_to_itemset
    if TOITEMSET:
        print ' '
        print 'Transform HMT to Itemset ...'
        print ' '
        concerned_attributes_entities_numeric = sorted(
            set(attributes_to_consider) & set(numeric_attrs) & set(items_header))
        concerned_attributes_entities_hmt = sorted(
            set(attributes_to_consider) & set(array_attrs) & set(items_header))
        concerned_attributes_entities_categorical = sorted(
            set(attributes_to_consider) & set(items_header) -
            (set(concerned_attributes_entities_hmt) | set(concerned_attributes_entities_numeric)))
        attributes_plain = (
            [{'name': a, 'type': 'themes'} for a in concerned_attributes_entities_hmt] +
            [{'name': a, 'type': 'numeric'} for a in concerned_attributes_entities_numeric] +
            [{'name': a, 'type': 'simple'} for a in concerned_attributes_entities_categorical])
        attributes = [dict(a) for a in attributes_plain]
        attributes = init_attributes_complex(considered_items_sorted, attributes)
        index_all = create_index_complex(considered_items_sorted, attributes)
        for attr in attributes:
            attr_name = attr['name']
            attr_type = attr['type']
            attr_domain = attr['domain']
            attr_labelmap = attr['labelmap']
            index_tag_items = {t: str(i).zfill(5) for i, t in enumerate(attr_domain.keys())}
            index_tag_items = {k: v + " " + attr_labelmap[k].partition(" ")[-1]
                               for k, v in index_tag_items.iteritems()}
            if attr_type == 'themes':
                for i, o in enumerate(index_all):
                    considered_items_sorted[i][attr_name] = [
                        index_tag_items[t] for t in sorted(o[attr_name]) if t != '']

    REPLACING_IDS = False
    if REPLACING_IDS:
        ind_to_dict_items = {i: x[items_id] for i, x in enumerate(considered_items_sorted)}
        dict_to_ind_items = {v: k for k, v in ind_to_dict_items.iteritems()}
        considered_items_sorted = [
            dict([(items_id, dict_to_ind_items[x[items_id]])] +
                 [(k, v) for k, v in x.items() if k != items_id])
            for x in considered_items_sorted]
        to_consider_ids_in_contexts_scope = {dict_to_ind_items[x]
                                             for x in to_consider_ids_in_contexts_scope}
        ind_to_dict_users = {i: x for i, x in enumerate(users_metadata)}
        dict_to_ind_users = {v: k for k, v in ind_to_dict_users.iteritems()}
        considered_users_1_sorted = [
            dict([(users_id, dict_to_ind_users[x[users_id]])] +
                 [(k, v) for k, v in x.items() if k != users_id])
            for x in considered_users_1_sorted]
        considered_users_2_sorted = [
            dict([(users_id, dict_to_ind_users[x[users_id]])] +
                 [(k, v) for k, v in x.items() if k != users_id])
            for x in considered_users_2_sorted]
        considered_users_sorted = [
            dict([(users_id, dict_to_ind_users[x[users_id]])] +
                 [(k, v) for k, v in x.items() if k != users_id])
            for x in considered_users_sorted]
        all_users_to_items_outcomes = {
            dict_to_ind_users[u]: {dict_to_ind_items[v]: o for v, o in u_votes.iteritems()}
            for u, u_votes in all_users_to_items_outcomes.iteritems()}
        items_metadata = {row[items_id]: row for row in considered_items_sorted}
        users_metadata = {
            dict_to_ind_users[u]: dict([(users_id, dict_to_ind_users[x[users_id]])] +
                                       [(k, v) for k, v in x.items() if k != users_id])
            for u, x in users_metadata.iteritems()}
    else:
        items_metadata = {row[items_id]: row for row in considered_items_sorted}

    process_outcome_dataset.STATS = {
        'nb_items_entities': nb_itemsets_all_context,
        'nb_items_individuals': nb_itemsets_all_individuals
    }
    return (items_metadata, users_metadata, all_users_to_items_outcomes, outcomes_considered,
            items_id, users_id, considered_items_sorted, considered_users_1_sorted,
            considered_users_2_sorted, nb_outcome_considered, vector_of_action,
            to_consider_ids_in_contexts_scope)
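# Hedged usage sketch (illustration only): loading a behavioral dataset from
# three delimited files. The file paths and attribute names are hypothetical
# placeholders; only the keyword arguments of process_outcome_dataset above
# are taken from the real signature.
def _demo_process_outcome_dataset():
    (items_metadata, users_metadata, outcomes_by_user, _, items_id, users_id,
     items_sorted, users_1_sorted, users_2_sorted, nb_outcomes, vector_of_action,
     contexts_scope_ids) = process_outcome_dataset(
        './data/items.csv', './data/users.csv', './data/outcomes.csv',   # hypothetical paths
        numeric_attrs=['AGE'], array_attrs=['PROCEDURE_SUBJECT'],        # hypothetical attributes
        attributes_to_consider=['PROCEDURE_SUBJECT', 'AGE'],
        nb_items=1000, nb_individuals=500, delimiter='\t')
    print nb_outcomes, len(items_sorted), len(users_1_sorted)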
def Alpha_Entry_Point(itemsFile, usersFile, reviewsFile, numeric_attrs=[], array_attrs=None,
                      outcome_attrs=None, method_aggregation_outcome='VECTOR_VALUES',
                      itemsScope=[], users_1_Scope=[], users_2_Scope=[], delimiter='\t',
                      description_attributes_items=[], description_attributes_users=[],
                      comparaison_measure='MAAD', qualityMeasure='DISAGR_SUMDIFF',
                      nb_items=float('inf'), nb_individuals=float('inf'),
                      threshold_comparaison=30, threshold_nb_users_1=10, threshold_nb_users_2=10,
                      quality_threshold=0.3, ponderation_attribute=None, bound_type=1,
                      pruning=True, closed=True, do_heuristic_contexts=False,
                      do_heuristic_peers=False, timebudget=1000,
                      results_destination='.//results.csv', attributes_to_consider=None,
                      heatmap_for_matrix=False, algorithm='DSC+CLOSED+UB2',
                      nb_items_entities=float('inf'), nb_items_individuals=float('inf'),
                      symmetry=True, nb_random_walks=30, hmt_to_itemset=False,
                      debug=False, verbose=False):
    method_aggregation_outcome = 'SYMBOLIC_MAJORITY'  # NOTE: overrides the keyword argument
    inited = time()
    print 'computing from zero'
    (items_metadata, users_metadata, all_users_to_items_outcomes, outcomes_considered,
     items_id_attribute, users_id_attribute, considered_items_sorted,
     considered_users_1_sorted, considered_users_2_sorted, nb_outcome_considered,
     vector_of_action, to_consider_ids_in_contexts_scope) = process_outcome_dataset(
        itemsFile, usersFile, reviewsFile, numeric_attrs=numeric_attrs, array_attrs=array_attrs,
        outcome_attrs=outcome_attrs, method_aggregation_outcome=method_aggregation_outcome,
        itemsScope=itemsScope, users_1_Scope=users_1_Scope, users_2_Scope=users_2_Scope,
        nb_items=nb_items, nb_individuals=nb_individuals,
        attributes_to_consider=attributes_to_consider, delimiter=delimiter)
    print considered_items_sorted[0]

    if False:
        # Disabled variant: reliability analysis on a survey-style dataset
        # ('CIGARETTE' / 'odors' attributes) with a numeric distance.
        considered_items_sorted = [x for x in considered_items_sorted
                                   if x[items_id_attribute] not in
                                   {'0002', '0012', '0022', '0032', '0042', '0052', '0062', '0072'}]
        print len(considered_items_sorted)
        desc_indiv = [{'dimensionName': 'CIGARETTE', 'inSet': {'Je fume'}},
                      {'dimensionName': 'SEXE', 'inSet': {'M'}},
                      {'dimensionName': 'AGE', 'inInterval': [0, 25]}][:1]
        groups_indiv, _ = filter_pipeline_obj(considered_users_1_sorted, desc_indiv)
        print len(groups_indiv)
        groups_indiv = set(x[users_id_attribute] for x in groups_indiv)
        set_of_entities = set(x[items_id_attribute] for x in considered_items_sorted)
        all_users_to_items_outcomes = {u: {x: y for x, y in v.iteritems() if x in set_of_entities}
                                       for u, v in all_users_to_items_outcomes.iteritems()}
        matrix_by_e, matrix, individuals_for_each_entity, nb_votes_casted_by_outcome = \
            coincidence_matrix(all_users_to_items_outcomes, individuals_set=groups_indiv)
        full_reliability, full_sums_votes = compute_reliability(
            matrix, individuals_for_each_entity, nb_votes_casted_by_outcome, set_of_entities,
            distance_function=distance_numerical)
        print 'reliability', full_reliability
        enumerator_contexts = enumerator_complex_cbo_init_new_config(
            considered_items_sorted, [{'name': 'odors', 'type': 'themes'}], threshold=1)
        results = []
        for e_p, e_label, e_config in enumerator_contexts:
            context_pat = pattern_printer(e_label, ['themes'])
            items_context = set(x[items_id_attribute] for x in e_config['support'])
            try:
                context_reliability, _ = compute_reliability(
                    matrix, individuals_for_each_entity, nb_votes_casted_by_outcome,
                    items_context, distance_function=distance_numerical,
                    nb_votes_casted_by_outcome_sums=full_sums_votes)
                print e_p, context_reliability
                raw_input('......')
            except Exception as e:
                continue
            results.append([context_pat, context_reliability, items_context])
        results = sorted(results, key=lambda x: x[1], reverse=False)
        to_ret = []
        for elem in results:
            print elem[0], elem[1]
            to_ret.append({'pattern': elem[0], 'reliability': elem[1]})

    to_ret = []
    if True:
        # NB: despite the variable name, the scope filters on COUNTRY == 'Greece'.
        frenchdeputies, _ = filter_pipeline_obj(
            considered_users_1_sorted, [{'dimensionName': 'COUNTRY', 'inSet': {'Greece'}}])
        frenchdeputies_metadata = sorted(frenchdeputies, key=lambda x: x[users_id_attribute])
        for row in frenchdeputies_metadata:
            row['NATIONAL_PARTY'] = unicodedata.normalize(
                'NFD', unicode(str(row['NATIONAL_PARTY']), 'iso-8859-1')).encode('ascii', 'ignore')
            row['NAME_FULL'] = unicodedata.normalize(
                'NFD', unicode(str(row['NAME_FULL']), 'iso-8859-1')).encode('ascii', 'ignore')
        frenchdeputies = set(x[users_id_attribute] for x in frenchdeputies)
        print len(frenchdeputies)
        set_of_entities = set(x[items_id_attribute] for x in considered_items_sorted)
        if True:
            print 'Starting filter'
            all_users_to_items_outcomes = {
                u: {x: y for x, y in v.iteritems() if y[0] != 'Abstain'}
                for u, v in all_users_to_items_outcomes.iteritems()}
            print 'Ending filter'
        matrix_by_e, matrix, individuals_for_each_entity, nb_votes_casted_by_outcome = \
            coincidence_matrix(all_users_to_items_outcomes, individuals_set=frenchdeputies)
        full_reliability, full_sums_votes = compute_reliability(
            matrix, individuals_for_each_entity, nb_votes_casted_by_outcome, set_of_entities)
        print 'reliability', full_reliability
        compute_reliability_with_bootstraping_KILEM_GWET_BLB(
            matrix_by_e, matrix, individuals_for_each_entity, nb_votes_casted_by_outcome,
            set_of_entities, distance_function=distance_nominal)
        enumerator_contexts = enumerator_complex_cbo_init_new_config(
            considered_items_sorted, [{'name': 'PROCEDURE_SUBJECT', 'type': 'themes'}], threshold=5)
        results = []
        index = 0
        for e_p, e_label, e_config in enumerator_contexts:
            index += 1
            context_pat = pattern_printer(e_label, ['themes'])
            items_context = set(x[items_id_attribute] for x in e_config['support'])
            context_reliability, _ = compute_reliability(
                matrix, individuals_for_each_entity, nb_votes_casted_by_outcome, items_context,
                nb_votes_casted_by_outcome_sums=full_sums_votes)
            my_reliability, _ = compute_reliability_NEW(
                matrix, individuals_for_each_entity, nb_votes_casted_by_outcome, items_context,
                nb_votes_casted_by_outcome_sums=full_sums_votes)
            cohesion, vector_full = compute_cohesion(all_users_to_items_outcomes, items_context,
                                                     frenchdeputies)
            if context_reliability > 0.85 and False:  # heat-map / dendrogram generation, disabled
                from heatmap.heatmap import generateHeatMap
                from clusters.hierarchicalClustering import (
                    drawDendrogramme, applyHierarchiqueClusteringFromDataset,
                    hierarchicalClusteringFromDataset)
                print context_reliability
                ref_matrix = similarity_dictionnary(all_users_to_items_outcomes, frenchdeputies,
                                                    set_of_entities)
                rower = [x['NATIONAL_PARTY'] + '_' + x['NAME_FULL'] + '_' + str(x[users_id_attribute])
                         for x in frenchdeputies_metadata]
                head = [['']] + rower
                ref_matrix_array = [[ref_matrix[u1][u2] if u1 < u2 else
                                     ref_matrix[u2][u1] if u1 > u2 else 1.
                                     for u2 in sorted(ref_matrix)] for u1 in sorted(ref_matrix)]
                for k in range(len(ref_matrix_array)):
                    ref_matrix_array[k].insert(0, rower[k])
                ref_matrix_array.insert(0, head)
                sim_matrix = similarity_dictionnary(all_users_to_items_outcomes, frenchdeputies,
                                                    items_context)
                dist_matrix_array = [[1. - sim_matrix[u1][u2] if u1 < u2 else
                                      1. - sim_matrix[u2][u1] if u1 > u2 else 0.
                                      for u2 in sorted(sim_matrix)] for u1 in sorted(sim_matrix)]
                sim_matrix_array = [[sim_matrix[u1][u2] if u1 < u2 else
                                     sim_matrix[u2][u1] if u1 > u2 else 1.
                                     for u2 in sorted(sim_matrix)] for u1 in sorted(sim_matrix)]
                for k in range(len(dist_matrix_array)):
                    dist_matrix_array[k].insert(0, rower[k])
                    sim_matrix_array[k].insert(0, rower[k])
                dist_matrix_array.insert(0, head)
                sim_matrix_array.insert(0, head)
                hierarchicalClusteringFromDataset(
                    frenchdeputies_metadata, dist_matrix_array,
                    dendrogrammeDestination='./Figures/fig_' + str(index) + '.png',
                    parameter=0.4, label_dendrogramme='NAME_FULL')
                writeCSV(dist_matrix_array, './Figures/matrix_' + str(index) + '.csv',
                         delimiter='\t')
                cp_matrice_pattern = generateHeatMap(
                    sim_matrix_array, './Figures/heatmap_' + str(index) + '.png',
                    vmin=0., vmax=1., showvalues_text=False, only_heatmap=True, organize=True)
                cp_matrice_pattern, ref_matrix_array = reorganize_similarly(cp_matrice_pattern,
                                                                            ref_matrix_array)
                generateHeatMap(ref_matrix_array, './Figures/REF_heatmap_' + str(index) + '.png',
                                vmin=0., vmax=1., showvalues_text=False, only_heatmap=True,
                                organize=False)
            results.append([index, context_pat, context_reliability, cohesion, items_context,
                            vector_full, my_reliability])
        results = sorted(results, key=lambda x: x[2], reverse=False)
        to_ret = []
        for elem in results:
            to_ret.append({'index': elem[0], 'pattern': elem[1], 'contextSize': len(elem[4]),
                           'reliability': elem[2], 'cohesion': elem[3], 'full_vector': elem[5],
                           'myreliability': elem[6]})
    return to_ret
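# Hedged usage sketch (illustration only): a minimal call of Alpha_Entry_Point
# returning contexts ranked by reliability. Paths and attribute names are
# hypothetical placeholders; the keyword arguments come from the signature above.
def _demo_alpha_entry_point():
    ranked_contexts = Alpha_Entry_Point(
        './data/items.csv', './data/users.csv', './data/reviews.csv',    # hypothetical paths
        numeric_attrs=['AGE'], array_attrs=['PROCEDURE_SUBJECT'],
        attributes_to_consider=['PROCEDURE_SUBJECT'],
        comparaison_measure='MAAD', qualityMeasure='DISAGR_SUMDIFF',
        threshold_comparaison=30, quality_threshold=0.3)
    for res in ranked_contexts[:5]:
        print res['pattern'], res['reliability']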
def workflowStage_iteratorsOnMultipeAttributes_subgroupBitwise_subgroups_tests(
        inputs={}, configuration={}, outputs={}, workflow_stats={}):
    '''
    {
        'id': 'stage_id',
        'type': 'multiple_attributes_iterator_sgbitwise_subgroups',
        'inputs': {
            'dataset': [],
            'attributes': [
                {
                    'name': name_attribute,
                    'type': 'themes' | 'numeric' | 'nominal',
                    'depthmax': None | 'or a value'
                },
                ...
            ]
        },
        'configuration': {
            'skip_list': [],
            'bitwise': []
        },
        'outputs': {
            'yielded_item': '',
            'yielded_index': ''
        }
    }
    '''
    from itertools import product
    votes_attributes = inputs['votes_attributes']
    users_attributes = inputs['users_attributes']
    position_attribute = inputs['position_attribute']
    vector_of_outcome = configuration.get('vector_of_outcome', None)
    user1_scope = inputs['user_1_scope']
    user2_scope = inputs['user_2_scope']
    dataset = inputs['dataset']
    reviews_dataset = configuration['reviews_dataset']
    items_dataset = configuration['items_dataset']
    users_dataset = configuration['users_dataset']
    filteredDataset1 = filter_pipeline_obj(dataset, user1_scope)[0]
    filteredDataset2 = filter_pipeline_obj(dataset, user2_scope)[0]
    filteredDataset = (filteredDataset1 + filteredDataset2 if user1_scope != user2_scope
                       else filteredDataset1)
    configuration_execution = inputs['XP']
    print ''
    final_stats = []
    considered = ['attr_items_range', 'attr_users_range', 'upperbound', 'attr_aggregates_range',
                  'nb_items_range', 'nb_users_range', 'sigma_user_range', 'sigma_agg_range',
                  'sigma_item_range', 'sigma_quality_range', 'top_k_range', 'prunning_range',
                  'closed_range', 'similarity_measures', 'quality_measures']
    total = 1
    for key in considered:
        total *= len(configuration_execution[key])
    index = 0
    actu_config = []
    last_config = []
    last_time_spent = 0
    time_threshold = 3600
    start = time()
    stdout.write('\rPercentage Done : ' + '%.2f' % (index * 100 / float(total)) + ' %\t' +
                 'Time elapsed : ' + '%.2f' % (time() - start) + 's')
    # Iterate over the cartesian product of all tested parameter ranges
    # (equivalent to the original deeply nested for-loops, in the same order).
    parameter_space = product(
        configuration_execution['closed_range'],
        configuration_execution['prunning_range'],
        configuration_execution['attr_items_range'],
        configuration_execution['attr_users_range'],
        configuration_execution['attr_aggregates_range'],
        configuration_execution['nb_items_range'],
        configuration_execution['nb_users_range'],
        configuration_execution['sigma_item_range'],
        configuration_execution['sigma_user_range'],
        configuration_execution['sigma_agg_range'],
        configuration_execution['similarity_measures'],
        configuration_execution['quality_measures'],
        configuration_execution['sigma_quality_range'],
        configuration_execution['top_k_range'])
    for (closed_inst, prunning_inst, attr_item_inst, attr_users_inst, attr_users_agg_inst,
         nb_items_inst, nb_users_inst, sigma_items_inst, sigma_users_inst, sigma_agg_users_inst,
         sim_measure_inst, quality_measure_inst, sigma_quality_inst, top_k_inst) in parameter_space:
        if prunning_inst and not closed_inst:
            continue
        upperbound_range = configuration_execution['upperbound']
        if not prunning_inst:
            upperbound_range = [1]
        for upperboundType in upperbound_range:
            index += 1
            input_new_attributes = [{'name': tupe[0], 'type': tupe[1]} for tupe in attr_item_inst]
            input_new_attributes += [{'name': tupe[0], 'type': tupe[1]} for tupe in attr_users_inst]
            conf_new = {
                'nb_dossiers_min': 1,
                'threshold_pair_comparaison': sigma_items_inst,
                # TODO: see again how to declare conditions on frequent patterns.
                'cover_threshold': configuration['cover_threshold'],
                'quality_threshold': sigma_quality_inst,
                'top_k': top_k_inst,
                'iwant': quality_measure_inst,
                'upperbound': upperboundType,
                'closed': closed_inst,
                'pruning': prunning_inst,
                'nb_items': nb_items_inst,
                'nb_users': nb_users_inst,
                'aggregation_attributes_user1': attr_users_agg_inst,
                'aggregation_attributes_user2': attr_users_agg_inst,
                'nb_aggergation_min_user1': sigma_agg_users_inst,
                'threshold_nb_users_1': sigma_users_inst,
                'nb_aggergation_min_user2': sigma_agg_users_inst,
                'threshold_nb_users_2': sigma_users_inst,
                'comparaison_measure': sim_measure_inst,
                'reviews_dataset': reviews_dataset,
                'items_dataset': items_dataset,
                'users_dataset': users_dataset,
                'vector_of_outcome': vector_of_outcome
            }
            actu_config = [conf_new['closed'], conf_new['pruning'], conf_new['upperbound']]
            # Skip a configuration identical (closed, pruning, upper bound) to the
            # previous one when the previous run already exceeded the time budget.
            if actu_config == last_config and last_time_spent >= time_threshold:
                continue
            attr_users_nb = len([x for x in input_new_attributes
                                 if x['name'] in users_attributes])
            # Previously dsc_c_method / dsc_uuc_method were selected depending on
            # attr_users_nb; DSCFORPERF is now used for every configuration.
            iterator = DSCFORPERF(filteredDataset, input_new_attributes, conf_new,
                                  user1_scope, user2_scope, votes_attributes,
                                  users_attributes, position_attribute)
            for (pattern, description, pairwiseStatistics, quality,
                 upper_bound, dossiers_voted) in iterator:
                outputs['yielded_item'] = pattern
                outputs['yielded_description'] = description
                outputs['yielded_index'] = index
                outputs['quality'] = quality
                outputs['upper_bound'] = True if index == 1 else False
                outputs['dossiers_voted'] = dossiers_voted
                outputs['pairwiseStatistics'] = pairwiseStatistics
                final_stats = final_stats + dossiers_voted
                yield dossiers_voted
                last_config = actu_config[:]
                last_time_spent = dossiers_voted[0]['#timespent']
            stdout.write('\rPercentage Done : ' + '%.2f' % (index * 100 / float(total)) +
                         ' %\t' + 'Time elapsed : ' + '%.2f' % (time() - start) + 's')
    outputs['dossiers_voted'] = final_stats
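# Hedged usage sketch (illustration only): driving the benchmarking stage above.
# The 'XP' grid enumerates one value per tested parameter; all attribute names
# and datasets are hypothetical placeholders, while the dictionary keys are the
# ones actually read by the stage.
def _demo_workflow_stage(dataset, reviews, items, users):
    inputs = {
        'dataset': dataset,
        'votes_attributes': ['VOTEID'], 'users_attributes': ['USERID'],
        'position_attribute': 'VOTE_POSITION',
        'user_1_scope': [], 'user_2_scope': [],
        'XP': {
            'attr_items_range': [[('PROCEDURE_SUBJECT', 'themes')]],
            'attr_users_range': [[('COUNTRY', 'simple')]],
            'attr_aggregates_range': [['NATIONAL_PARTY']],
            'nb_items_range': [float('inf')], 'nb_users_range': [float('inf')],
            'sigma_item_range': [30], 'sigma_user_range': [10], 'sigma_agg_range': [1],
            'sigma_quality_range': [0.3], 'top_k_range': [10],
            'prunning_range': [True], 'closed_range': [True], 'upperbound': [1, 2],
            'similarity_measures': ['MAAD'], 'quality_measures': ['DISAGR_SUMDIFF'],
        },
    }
    configuration = {'cover_threshold': 0.9, 'reviews_dataset': reviews,
                     'items_dataset': items, 'users_dataset': users}
    for stats in workflowStage_iteratorsOnMultipeAttributes_subgroupBitwise_subgroups_tests(
            inputs, configuration, outputs={}):
        print stats[0]['#timespent']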
def extractStatistics_fromdataset_vectors(dataset, votesAttributes, usersAttributes,
                                          position_attribute, user1_scope=[], user2_scope=[]):
    vote_identifier = str(votesAttributes[0])
    user_identifier = str(usersAttributes[0])
    user_1_identifier = 'USER1'
    user_2_identifier = 'USER2'
    mepsStats = {}
    mepsMeta = {}
    listsOfVotes = {}
    mapping_user_vote = {'for': 'Y', 'against': 'N', 'abstain': 'A'}
    users_map_details = {}
    for obj in dataset:
        vote_id = str(obj[vote_identifier])
        if vote_id not in listsOfVotes:
            listsOfVotes[vote_id] = []
        listsOfVotes[vote_id].append(obj)
        d_user_id = obj[user_identifier]
        if d_user_id not in users_map_details:
            users_map_details[d_user_id] = {key: obj[key] for key in usersAttributes}
    users_map_details_array = users_map_details.values()
    users_map_details_array_filtered_user1 = filter_pipeline_obj(users_map_details_array, user1_scope)[0]
    users_map_details_array_filtered_user2 = filter_pipeline_obj(users_map_details_array, user2_scope)[0]
    users1_ids = set([x[user_identifier] for x in users_map_details_array_filtered_user1])
    users2_ids = set([x[user_identifier] for x in users_map_details_array_filtered_user2])
    nb_votes_all = len(listsOfVotes)
    for vote_id, actualVote in listsOfVotes.iteritems():
        for mep1_object in actualVote:
            if mep1_object[user_identifier] in users1_ids:
                mep1_index = str(mep1_object[user_identifier])
                try:
                    pairwiseStatsRowOfMep1 = mepsStats[mep1_index]
                except KeyError:
                    mepsStats[mep1_index] = {}
                    mepsMeta[mep1_index] = {attribute_user: mep1_object[attribute_user]
                                            for attribute_user in usersAttributes}
                    pairwiseStatsRowOfMep1 = mepsStats[mep1_index]
                for mep2_object in actualVote:
                    if mep2_object[user_identifier] in users2_ids:
                        mep2_index = str(mep2_object[user_identifier])
                        try:
                            pairOfMeps = pairwiseStatsRowOfMep1[mep2_index]
                        except KeyError:
                            # FLAGCOMPUTED: once True, '**' holds vector-similarity
                            # results instead of the raw vectors themselves.
                            pairwiseStatsRowOfMep1[mep2_index] = {
                                user_1_identifier: mep1_index, user_2_identifier: mep2_index,
                                'NB_VOTES': 0, '**': {}, 'ALL_VOTES': nb_votes_all,
                                'FLAGCOMPUTED': False}
                            mepsMeta[mep2_index] = {attribute_user: mep2_object[attribute_user]
                                                    for attribute_user in usersAttributes}
                            pairOfMeps = pairwiseStatsRowOfMep1[mep2_index]
                        vector_mepwise12 = (mep1_object[position_attribute],
                                            mep2_object[position_attribute])
                        pairOfMeps['**'][vote_id] = vector_mepwise12
                        pairOfMeps['NB_VOTES'] = pairOfMeps['NB_VOTES'] + 1
    # Add an empty record for pairs of users who never voted together.
    for mep1_index in users1_ids:
        for mep2_index in users2_ids:
            if mep1_index in mepsStats:
                if mep2_index not in mepsStats[mep1_index]:
                    mepsStats[mep1_index][mep2_index] = {
                        user_1_identifier: mep1_index, user_2_identifier: mep2_index,
                        'NB_VOTES': 0, '**': {}, 'ALL_VOTES': nb_votes_all,
                        'FLAGCOMPUTED': False}
                mepsStats[mep1_index][mep2_index]['KEYS'] = set(mepsStats[mep1_index][mep2_index]['**'])
    return mepsStats, mepsMeta
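# Hedged usage sketch (illustration only): the pairwise vote-vector statistics
# on a tiny in-memory dataset. The column names mirror the identifiers used in
# the demos above; the rows themselves are made up.
def _demo_extract_statistics():
    dataset = [
        {'VOTEID': 'v1', 'USERID': 'u1', 'VOTE_POSITION': 'For'},
        {'VOTEID': 'v1', 'USERID': 'u2', 'VOTE_POSITION': 'Against'},
        {'VOTEID': 'v2', 'USERID': 'u1', 'VOTE_POSITION': 'For'},
        {'VOTEID': 'v2', 'USERID': 'u2', 'VOTE_POSITION': 'For'},
    ]
    stats, meta = extractStatistics_fromdataset_vectors(
        dataset, ['VOTEID'], ['USERID'], 'VOTE_POSITION')
    # stats['u1']['u2']['**'] maps each vote id to the pair of positions cast.
    print stats['u1']['u2']['NB_VOTES'], stats['u1']['u2']['**']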