def compile(self):
    '''
    Compile the text files to DDStorm modules.

    Scans the configured library and custom directories for *.txt
    sources, recreates the module output directory, removes stale
    *.module files, and compiles every source in priority order
    (library files first, then custom files).
    '''
    self.source = set()
    self.custom = set()
    self.alias = Alias(self._conf)
    # Loop over library files and add *.txt files to source
    for path, subdirs, files in os.walk(self._conf.get("library_path")):
        for name in files:
            if fnmatch(name, "*.txt"):
                self.source.add(os.path.join(path, name))
    # Loop over custom files and add *.txt files to custom
    for path, subdirs, files in os.walk(self._conf.get("custom_path")):
        for name in files:
            if fnmatch(name, "*.txt"):
                self.custom.add(os.path.join(path, name))
    # Create module directory if not already present and delete all module files
    module_path = self._conf.get("module_path")
    if not os.path.isdir(module_path):
        os.makedirs(module_path)
    for f in os.listdir(module_path):
        if fnmatch(f, "*.module"):
            # BUG FIX: use os.path.join instead of raw string concatenation,
            # which produced a wrong path when module_path had no trailing
            # separator.
            os.unlink(os.path.join(module_path, f))
    # Create a regex for calculating priority from filename.
    # Raw string avoids the invalid-escape-sequence warning for \. and \d.
    self.priorityRegex = re.compile(r"(?<=\.)\d+$")
    # First sort files by priority then compile them to module
    for src in self._sortPriority(self.source):
        self._makeModule(src)
    for src in self._sortPriority(self.custom):
        self._makeModule(src)
def _build_alias(self):
    """Render the alias map into ``_alias.py`` and return a length helper.

    Loads the variable template, serializes the alias map as a module-level
    ``g_aliasMap`` variable, writes the generated module to ``_alias.py``,
    and returns a VariablesLengthHelper built from the same map.
    """
    template = Environment(
        loader=FileSystemLoader(self.TEMPLATES_PATH)).get_template(
            self.VARIABLE_TEMPLATE)
    alias_map = Alias(self._base_path).get_map()
    # BUG FIX: Jinja2's render() returns str, so the file must be opened in
    # text mode; 'wb' raised TypeError on Python 3.
    with open('_alias.py', 'w') as f:
        f.write(
            template.render(
                dict(varName='g_aliasMap', value=repr(alias_map))))
    return VariablesLengthHelper(alias_map)
def parseMSOffice2011Plist(self, mru_file):
    """Parse an MS Office 2011 MRU plist and return the parsed aliases.

    Word (MSWD), Excel (XCEL) and PowerPoint (PPT3) each keep their own
    MRU list under "14\File MRU\..."; entries from any list that is
    missing or malformed are skipped (best effort).
    """
    plist = self.load_bplist(mru_file)
    if plist is None:
        return []
    aliases = []
    # One MRU list per Office application; keys copied verbatim.
    for key in ("14\File MRU\MSWD", "14\File MRU\XCEL", "14\File MRU\PPT3"):
        try:
            for item in plist[key]:
                aliases.append(Alias(data=item["File Alias"]).parse())
        # Narrowed from bare `except:` so Ctrl-C / SystemExit still propagate;
        # missing keys or malformed entries remain best-effort skips.
        except Exception:
            pass
    return aliases
def parseSidebarlistsPlist(self, mru_file):
    """Parse a sidebarlists plist and return parsed volume aliases.

    Reads the 'Alias' blob of every entry in both the systemitems and
    favorites VolumesList sections; unparsable entries are skipped.
    """
    plist = self.load_bplist(mru_file)
    if plist is None:
        return []
    aliases = []
    try:
        for item in plist["systemitems"]['VolumesList']:
            try:
                aliases.append(Alias(data=item['Alias']).parse())
            except Exception:
                pass
    except Exception:
        pass
    try:
        for item in plist["favorites"]['VolumesList']:
            try:
                # BUG FIX: the original indexed plist["systemitems"] here
                # (copy-paste), re-parsing systemitems aliases instead of
                # the favorites entry being iterated.
                aliases.append(Alias(data=item['Alias']).parse())
            except Exception:
                pass
    except Exception:
        pass
    return aliases
def parseRecentItemsPlist(self, mru_file):
    """Parse a RecentItems plist.

    Returns a (bookmarks, aliases) pair: Bookmark blobs from the
    Recent* sections and Alias blobs from the Applications/Documents/
    Servers sections. Sections that are absent or malformed are skipped.
    """
    plist = self.load_bplist(mru_file)
    if plist == None:
        return []
    bookmarks = []
    aliases = []
    # Newer-style sections carry Bookmark blobs.
    for section in ("RecentApplications", "RecentDocuments", "RecentServers"):
        try:
            for entry in plist[section]["CustomListItems"]:
                bookmarks.append(Bookmark(data=entry["Bookmark"]).parse())
        except:
            pass
    # Older-style sections carry Alias blobs.
    for section in ("Applications", "Documents", "Servers"):
        try:
            for entry in plist[section]["CustomListItems"]:
                aliases.append(Alias(data=entry["Alias"]).parse())
        except:
            pass
    return bookmarks, aliases
def parseFinderPlist(self, mru_file):
    """Parse a Finder plist's FXRecentFolders entries.

    Returns a (bookmarks, aliases) pair; each recent-folder entry may
    carry a 'file-bookmark' blob, a 'file-data' alias blob, or both.
    Unparsable entries are skipped.
    """
    plist = self.load_bplist(mru_file)
    if plist == None:
        return []
    bookmarks = []
    aliases = []
    try:
        for folder_entry in plist["FXRecentFolders"]:
            # Each entry is tried for both blob flavours independently.
            try:
                bookmarks.append(
                    Bookmark(data=folder_entry["file-bookmark"]).parse())
            except:
                pass
            try:
                aliases.append(
                    Alias(data=folder_entry["file-data"]
                          ["_CFURLAliasData"]).parse())
            except:
                pass
    except:
        pass
    return bookmarks, aliases
class Profile(namedtuple('Profile', 'v e iv ie sim')):
    """Cardinality/selectivity profile for a relational-algebra node.

    Tuple fields: v, e, iv, ie, sim. `build` dispatches on the node's
    'relOp' to the matching per-operator profile builder.
    """

    # Column de-aliasing callable shared by all profiles.
    _aliases = Alias(set)

    # relOp name -> profile-builder dispatch table.
    _profiles = {
        'JdbcTableScan': tablescan,
        'JdbcProjectRel': projection,
        'JdbcFilterRel': selection,
        'JdbcJoinRel': join,
        'JdbcAggregateRel': aggregate,
        'JdbcToEnumerableConverter': jdbctoenumerate
    }

    @classmethod
    def _antialias(cls, columns):
        """Resolve every column through the alias map, flattened to a set."""
        resolved = flat(cls._aliases(column) for column in columns)
        return set(resolved)

    @classmethod
    def build(cls, node, inputs):
        """Build the profile for `node` by dispatching on its relOp."""
        builder = cls._profiles[node.get('relOp')]
        return builder(node, inputs)
def add_alias(self, new_cpp_type_name, old_cpp_type_name):
    """Register a C++ type alias and emit the matching KL declarations.

    Creates the direct alias plus four derived aliases (const pointer,
    mutable pointer, const reference, mutable reference), each with a KL
    epilog providing Make_* conversion helpers that forward to the old
    type's Make_* functions. Returns the direct Alias declaration, or an
    EmptyCommentContainer if anything fails (the alias is then skipped
    with a warning).

    NOTE(review): whitespace inside the KL template strings was
    reconstructed from a collapsed source — verify against generated
    output.
    """
    try:
        # --- Direct alias: new name -> resolved old type -------------------
        direct_new_cpp_global_expr = self.cpp_type_expr_parser.parse(
            new_cpp_type_name).prefix(self.components)
        direct_old_cpp_global_expr = self.resolve_cpp_type_expr(
            old_cpp_type_name)
        self.type_mgr.add_alias(direct_new_cpp_global_expr,
                                direct_old_cpp_global_expr)
        direct_new_kl_local_name = new_cpp_type_name
        direct_new_kl_global_name = '_'.join(self.nested_kl_names +
                                             [direct_new_kl_local_name])
        direct_old_dqti = self.type_mgr.get_dqti(
            direct_old_cpp_global_expr)
        # Debug output left in by the original author (Python 2 prints).
        print "direct_old_dqti.type_info.kl.name = " + str(
            direct_old_dqti.type_info.kl.name)
        print "direct_old_dqti.type_info.edk.name = " + str(
            direct_old_dqti.type_info.edk.name)
        print "direct_old_dqti.type_info.lib.name = " + str(
            direct_old_dqti.type_info.lib.name)
        print "direct_old_dqti.type_info.lib.expr = " + str(
            direct_old_dqti.type_info.lib.expr)
        direct_alias = Alias(self, direct_new_kl_global_name,
                             direct_old_dqti.type_info)
        self.ext.add_decl(direct_alias)
        # --- Const pointer alias (<name>_CxxConstPtr) ----------------------
        const_ptr_new_cpp_type_expr = PointerTo(
            Const(direct_new_cpp_global_expr))
        const_ptr_old_cpp_type_expr = PointerTo(
            Const(direct_old_cpp_global_expr))
        self.type_mgr.add_alias(const_ptr_new_cpp_type_expr,
                                const_ptr_old_cpp_type_expr)
        const_ptr_new_kl_type_name = direct_new_kl_global_name + "_CxxConstPtr"
        const_ptr_old_dqti = self.type_mgr.get_dqti(
            const_ptr_old_cpp_type_expr)
        const_ptr_old_kl_type_name = const_ptr_old_dqti.type_info.kl.name.compound
        const_ptr_alias = Alias(self, const_ptr_new_kl_type_name,
                                const_ptr_old_dqti.type_info)
        self.ext.add_decl(const_ptr_alias)
        # KL Make_* helpers forwarding to the old type's Make_* functions.
        self.ext.add_kl_epilog("""
%s Make_%s(%s value) {
  return Make_%s(value);
}

%s Make_%s(io %s value) {
  return Make_%s(value);
}
""" % (
            const_ptr_new_kl_type_name,
            const_ptr_new_kl_type_name,
            direct_new_kl_global_name,
            const_ptr_old_kl_type_name,
            const_ptr_new_kl_type_name,
            const_ptr_new_kl_type_name,
            direct_new_kl_global_name,
            const_ptr_old_kl_type_name,
        ))
        # --- Mutable pointer alias (<name>_CxxPtr) -------------------------
        mutable_ptr_new_cpp_type_expr = PointerTo(
            direct_new_cpp_global_expr)
        mutable_ptr_old_cpp_type_expr = PointerTo(
            direct_old_cpp_global_expr)
        self.type_mgr.add_alias(mutable_ptr_new_cpp_type_expr,
                                mutable_ptr_old_cpp_type_expr)
        mutable_ptr_new_kl_type_name = direct_new_kl_global_name + "_CxxPtr"
        mutable_ptr_old_dqti = self.type_mgr.get_dqti(
            mutable_ptr_old_cpp_type_expr)
        mutable_ptr_old_kl_type_name = mutable_ptr_old_dqti.type_info.kl.name.compound
        mutable_ptr_alias = Alias(self, mutable_ptr_new_kl_type_name,
                                  mutable_ptr_old_dqti.type_info)
        self.ext.add_decl(mutable_ptr_alias)
        self.ext.add_kl_epilog("""
%s Make_%s(%s value) {
  return Make_%s(value);
}

%s Make_%s(io %s value) {
  return Make_%s(value);
}
""" % (
            mutable_ptr_new_kl_type_name,
            mutable_ptr_new_kl_type_name,
            direct_new_kl_global_name,
            mutable_ptr_old_kl_type_name,
            mutable_ptr_new_kl_type_name,
            mutable_ptr_new_kl_type_name,
            direct_new_kl_global_name,
            mutable_ptr_old_kl_type_name,
        ))
        # --- Const reference alias (<name>_CxxConstRef) --------------------
        const_ref_new_cpp_type_expr = ReferenceTo(
            Const(direct_new_cpp_global_expr))
        const_ref_old_cpp_type_expr = ReferenceTo(
            Const(direct_old_cpp_global_expr))
        self.type_mgr.add_alias(const_ref_new_cpp_type_expr,
                                const_ref_old_cpp_type_expr)
        const_ref_new_kl_type_name = direct_new_kl_global_name + "_CxxConstRef"
        const_ref_old_dqti = self.type_mgr.get_dqti(
            const_ref_old_cpp_type_expr)
        const_ref_old_kl_type_name = const_ref_old_dqti.type_info.kl.name.compound
        const_ref_alias = Alias(self, const_ref_new_kl_type_name,
                                const_ref_old_dqti.type_info)
        self.ext.add_decl(const_ref_alias)
        self.ext.add_kl_epilog("""
%s Make_%s(%s value) {
  return Make_%s(value);
}

%s Make_%s(io %s value) {
  return Make_%s(value);
}
""" % (
            const_ref_new_kl_type_name,
            const_ref_new_kl_type_name,
            direct_new_kl_global_name,
            const_ref_old_kl_type_name,
            const_ref_new_kl_type_name,
            const_ref_new_kl_type_name,
            direct_new_kl_global_name,
            const_ref_old_kl_type_name,
        ))
        # --- Mutable reference alias (<name>_CxxRef) -----------------------
        mutable_ref_new_cpp_type_expr = ReferenceTo(
            direct_new_cpp_global_expr)
        mutable_ref_old_cpp_type_expr = ReferenceTo(
            direct_old_cpp_global_expr)
        self.type_mgr.add_alias(mutable_ref_new_cpp_type_expr,
                                mutable_ref_old_cpp_type_expr)
        mutable_ref_new_kl_type_name = direct_new_kl_global_name + "_CxxRef"
        mutable_ref_old_dqti = self.type_mgr.get_dqti(
            mutable_ref_old_cpp_type_expr)
        mutable_ref_old_kl_type_name = mutable_ref_old_dqti.type_info.kl.name.compound
        mutable_ref_alias = Alias(self, mutable_ref_new_kl_type_name,
                                  mutable_ref_old_dqti.type_info)
        self.ext.add_decl(mutable_ref_alias)
        self.ext.add_kl_epilog("""
%s Make_%s(%s value) {
  return Make_%s(value);
}

%s Make_%s(io %s value) {
  return Make_%s(value);
}
""" % (
            mutable_ref_new_kl_type_name,
            mutable_ref_new_kl_type_name,
            direct_new_kl_global_name,
            mutable_ref_old_kl_type_name,
            mutable_ref_new_kl_type_name,
            mutable_ref_new_kl_type_name,
            direct_new_kl_global_name,
            mutable_ref_old_kl_type_name,
        ))
        return direct_alias
    except Exception as e:
        # Best-effort: a failed alias is reported and skipped, never fatal.
        self.ext.warning("Ignoring alias '%s': %s" % (new_cpp_type_name, e))
        return EmptyCommentContainer()
# NOTE(review): this is an orphan fragment — it reads like the interior of a
# per-row loop (cf. the near-identical code inside resolve_aliases below),
# but its enclosing loop header and the trailing part of the prefix handling
# are not visible here. Presumably `row` is a CSV-like record; verify against
# the original caller.
uid = row[0]
login = row[1].strip()
name = row[2]
user_type = row[7].strip()
location = row[4]
email = row[5]
# Every uid initially maps to itself; merging rewrites these entries later.
unmask[uid] = uid
# Logins matching the fake-user pattern are flagged as machine accounts.
m = fakeusr_rex.search(login)
if m is not None:
    record_type = USR_FAKE
else:
    record_type = USR_REAL
a = Alias(record_type, uid, login, name, email, location, user_type)
aliases[uid] = a
# - email: uid <-> email lookup tables
d_uid_email[a.uid] = a.email
if a.email is not None:
    d_email_uid.setdefault(a.email, set([a.uid]))
    d_email_uid[a.email].add(a.uid)
# - prefix: composite prefixes (with '.' or '_') are indexed separately
d_uid_prefix[a.uid] = a.email_prefix
d_uid_comp_prefix[a.uid] = a.email_prefix
if a.email_prefix is not None:
    if len(a.email_prefix.split('.')) > 1 or len(
            a.email_prefix.split('_')) > 1:
        d_comp_prefix_uid.setdefault(a.email_prefix, set([a.uid]))
def resolve_aliases(slug, inputs):
    """Cluster user records that likely belong to the same person.

    Builds lookup tables over email, email prefix, domain, login, name and
    location from the rows of `inputs` (iterated via .iterrows(), so
    presumably a pandas DataFrame — TODO confirm), collects pairwise
    "clues" (shared attributes), merges clued pairs into clusters with a
    union-find style `merge`, validates each cluster against a rule
    checklist, and elects a representative uid per valid cluster.

    Returns a dict `unmask` mapping every uid to its representative uid
    (identity for unclustered users). `slug` is only used by the
    commented-out debug dump.
    """
    print_flag = 0
    #out = open("merge_dump/"+slug.replace("/", "_____"), "w")
    unmask = {}
    aliases = {}
    # Helper structures
    d_email_uid = {}
    d_uid_email = {}
    d_prefix_uid = {}
    d_uid_prefix = {}
    d_comp_prefix_uid = {}
    d_uid_comp_prefix = {}
    d_uid_domain = {}
    d_domain_uid = {}
    d_name_uid = {}
    d_uid_name = {}
    d_name_parts_uid = {}
    d_uid_name_parts = {}
    d_name_app_uid = {}
    d_uid_app_parts = {}
    d_login_uid = {}
    d_uid_login = {}
    d_location_uid = {}
    d_uid_location = {}
    d_uid_type = {}
    #d_type_usr = {}
    uid = 0
    # raw = {}
    # Pass 1: build all the lookup tables, one Alias record per row.
    for ind, row in inputs.iterrows():
        uid = row["id"]
        name = row["name"]
        email = row["email"]
        # raw[uid] = line
        login = row["login"]  #None #row[1].strip()
        if row["type"] == None:
            user_type = ""
        else:
            user_type = str(row["type"])  #None
        if row["location"] == None:
            location = ""
        else:
            location = str(row["location"])  #None
        # try:
        #     name = line.split('<')[0].strip()
        #     email = line.split('<')[1].strip().split('>')[0].strip()
        # except:
        #     print line
        #     exit()
        # Identity mapping by default; merging rewrites entries below.
        unmask[uid] = uid
        if row["record_type"] == 1:
            record_type = USR_REAL
        else:
            record_type = USR_FAKE
        # m = fakeusr_rex.search(login)
        # if m is not None:
        #     record_type = USR_FAKE
        # else:
        #     record_type = USR_REAL
        a = Alias(record_type, uid, login, name, email, location, user_type)
        aliases[uid] = a
        # - email
        d_uid_email[a.uid] = a.email
        if a.email is not None:
            d_email_uid.setdefault(a.email, set([a.uid]))
            d_email_uid[a.email].add(a.uid)
        # - prefix (composite prefixes containing '.' or '_' indexed apart)
        d_uid_prefix[a.uid] = a.email_prefix
        d_uid_comp_prefix[a.uid] = a.email_prefix
        if a.email_prefix is not None:
            if len(a.email_prefix.split('.')) > 1 or len(
                    a.email_prefix.split('_')) > 1:
                d_comp_prefix_uid.setdefault(a.email_prefix, set([a.uid]))
                d_comp_prefix_uid[a.email_prefix].add(a.uid)
            else:
                d_prefix_uid.setdefault(a.email_prefix, set([a.uid]))
                d_prefix_uid[a.email_prefix].add(a.uid)
        # - domain
        d_uid_domain[a.uid] = a.email_domain
        if a.email_domain is not None:
            d_domain_uid.setdefault(a.email_domain, set([a.uid]))
            d_domain_uid[a.email_domain].add(a.uid)
        # - login (REAL users also indexed case-insensitively)
        d_uid_login[a.uid] = a.login
        if a.login is not None:
            d_login_uid.setdefault(a.login, set([a.uid]))
            d_login_uid[a.login].add(a.uid)
            if a.record_type == USR_REAL:
                d_login_uid.setdefault(a.login.lower(), set([a.uid]))
                d_login_uid[a.login.lower()].add(a.uid)
        # type
        d_uid_type[a.uid] = a.usr_type
        # - name
        d_uid_name[a.uid] = a.name
        if a.name is not None and len(a.name):
            d_name_uid.setdefault(a.name, set([a.uid]))
            d_name_uid[a.name].add(a.uid)
            if len(a.name.split(' ')) == 1:
                d_name_uid.setdefault(a.name.lower(), set([a.uid]))
                d_name_uid[a.name.lower()].add(a.uid)
                # janejohnson -> janejohnson
                # we need this for matching
                d_name_app_uid.setdefault(a.name.lower(), set([a.uid]))
                d_name_app_uid[a.name.lower()].add(a.uid)
            # jane johnson -> janejohnson
            d_name_app_uid.setdefault("".join(a.name.split(" ")).lower(),
                                      set([a.uid]))
            d_name_app_uid["".join(a.name.split(" ")).lower()].add(a.uid)
            if "@" in a.name:
                # otherwise it will make "gmail", "com" as names
                name_subpart = a.name.split("@")[0]
                d_name_parts_uid.setdefault(name_subpart.lower(),
                                            set([a.uid]))
                d_name_parts_uid[name_subpart.lower()].add(a.uid)
            else:
                # xiyi ji -> ji xiyi
                name_parts_split = a.name.lower().replace(",", " ").replace(
                    ".", " ").split(' ')
                if len(name_parts_split) != 2:
                    continue
                new_name_parts = name_parts_split[-1] + " " + \
                    name_parts_split[0]
                d_name_parts_uid.setdefault(new_name_parts, set([a.uid]))
                d_name_parts_uid[new_name_parts].add(a.uid)
        # - location
        d_uid_location[a.uid] = a.location
        if a.location is not None and len(a.location):
            d_location_uid.setdefault(a.location, set([a.uid]))
            d_location_uid[a.location].add(a.uid)
        # idx += 1
        # if idx >= curidx:
        #     print curidx/step
        #     curidx += step
    # print 'Done: helpers'
    # Pass 2: collect pairwise clues. Keys are (a, b) uid pairs with a < b
    # (numeric order); values are lists of rule constants that fired.
    clues = {}
    for email, set_uid in d_email_uid.items():
        if len(set_uid) > THR_MIN:
            for a, b in combinations(
                    sorted(set_uid, key=lambda uid: int(uid)), 2):
                clues.setdefault((a, b), [])
                clues[(a, b)].append(EMAIL)
                # print a,b,EMAIL
    # print 'Done: email'
    for prefix, set_uid in d_comp_prefix_uid.items():
        if len(set_uid) > THR_MIN and len(set_uid) < THR_MAX:
            if len(prefix) >= 3:
                for a, b in combinations(
                        sorted(set_uid, key=lambda uid: int(uid)), 2):
                    clues.setdefault((a, b), [])
                    clues[(a, b)].append(COMP_EMAIL_PREFIX)
                    # print a,b,COMP_EMAIL_PREFIX
    # print 'Done: comp email prefix'
    for prefix, set_uid in d_prefix_uid.items():
        if len(set_uid) > THR_MIN and len(set_uid) < THR_MAX:
            if len(prefix) >= 3:
                for a, b in combinations(
                        sorted(set_uid, key=lambda uid: int(uid)), 2):
                    clues.setdefault((a, b), [])
                    clues[(a, b)].append(SIMPLE_EMAIL_PREFIX)
                    # print a,b,SIMPLE_EMAIL_PREFIX
    # print 'Done: email prefix'
    # Cross-table clues: one user's email prefix equals another's login/name.
    for prefix in set(d_prefix_uid.keys()).intersection(
            set(d_login_uid.keys())):
        if len(d_prefix_uid[prefix]) < THR_MAX:
            for a, b in product(
                    sorted(d_login_uid[prefix], key=lambda uid: int(uid)),
                    sorted(d_prefix_uid[prefix], key=lambda uid: int(uid))):
                if a < b:
                    clues.setdefault((a, b), [])
                    if not SIMPLE_EMAIL_PREFIX in clues[(a, b)]:
                        clues[(a, b)].append(PREFIX_LOGIN)
                        # print a,b,PREFIX_LOGIN
    # print 'Done: prefix=login'
    for prefix in set(d_prefix_uid.keys()).intersection(
            set(d_name_uid.keys())):
        if len(d_prefix_uid[prefix]) < THR_MAX and len(
                d_name_uid[prefix]) < THR_MAX:
            for a, b in product(
                    sorted(d_name_uid[prefix], key=lambda uid: int(uid)),
                    sorted(d_prefix_uid[prefix], key=lambda uid: int(uid))):
                if a < b:
                    clues.setdefault((a, b), [])
                    if not SIMPLE_EMAIL_PREFIX in clues[(a, b)]:
                        clues[(a, b)].append(PREFIX_NAME)
    # print 'Done: prefix=name'
    for prefix in set(d_login_uid.keys()).intersection(
            set(d_name_uid.keys())):
        if len(d_name_uid[prefix]) < THR_MAX:
            for a, b in product(
                    sorted(d_name_uid[prefix], key=lambda uid: int(uid)),
                    sorted(d_login_uid[prefix], key=lambda uid: int(uid))):
                if a < b:
                    clues.setdefault((a, b), [])
                    if not SIMPLE_EMAIL_PREFIX in clues[(a, b)]:
                        clues[(a, b)].append(LOGIN_NAME)
    # print 'Done: login=name'
    for name, set_uid in d_name_uid.items():
        if len(set_uid) > THR_MIN and len(set_uid) < THR_MAX:
            # Multi-word names are a stronger signal than single-word ones.
            if len(name.split(' ')) > 1:
                for a, b in combinations(
                        sorted(set_uid, key=lambda uid: int(uid)), 2):
                    clues.setdefault((a, b), [])
                    clues[(a, b)].append(FULL_NAME)
            else:
                for a, b in combinations(
                        sorted(set_uid, key=lambda uid: int(uid)), 2):
                    clues.setdefault((a, b), [])
                    clues[(a, b)].append(SIMPLE_NAME)
    # print 'Done: full/simple name'
    for name, set_uid in d_name_parts_uid.items():
        #out.write(name + "," + str(set_uid) + "\n")
        if len(set_uid) > THR_MIN and len(set_uid) < THR_MAX:
            for a, b in combinations(
                    sorted(set_uid, key=lambda uid: int(uid)), 2):
                clues.setdefault((a, b), [])
                clues[(a, b)].append(NAME_PARTS)
        #out.write("\n")
    # print 'Done: name parts'
    for name, set_uid in d_name_app_uid.items():
        #out.write(name + "," + str(set_uid))
        if len(set_uid) > THR_MIN and len(set_uid) < THR_MAX:
            for a, b in combinations(
                    sorted(set_uid, key=lambda uid: int(uid)), 2):
                clues.setdefault((a, b), [])
                clues[(a, b)].append(NAME_APPENDED)
    # print 'Done: name parts appended'
    for domain, set_uid in d_domain_uid.items():
        if len(set_uid) > THR_MIN and len(set_uid) < THR_MAX:
            for a, b in combinations(
                    sorted(set_uid, key=lambda uid: int(uid)), 2):
                clues.setdefault((a, b), [])
                clues[(a, b)].append(DOMAIN)
    # print 'Done: email domain'
    # Location only counts when the pair also shares a multi-word name.
    for location, set_uid in d_location_uid.items():
        if len(set_uid) > THR_MIN:
            for a, b in combinations(
                    sorted(set_uid, key=lambda uid: int(uid)), 2):
                na = d_uid_name[a]
                nb = d_uid_name[b]
                if na is not None and nb is not None and len(
                        na.split()) > 1 and na == nb:
                    if len(d_name_uid.get(na, set([]))) < THR_MAX:
                        clues.setdefault((a, b), [])
                        clues[(a, b)].append(LOCATION)
    # print 'Done: location'
    # Pass 3: union-find style clustering.
    # d_alias_map: uid -> cluster root; clusters: root -> member uids;
    # labels: root -> list of rules that merged the cluster.
    d_alias_map = {}
    clusters = {}
    labels = {}

    def merge(a, b, rule):
        # Contract: a < b
        assert a < b, "A must be less than B"
        if a in d_alias_map:
            if b in d_alias_map:
                if d_alias_map[a] == d_alias_map[b]:
                    labels[d_alias_map[a]].append(rule)
                else:
                    # Union two existing clusters under the lower root.
                    lowest = min(d_alias_map[a], d_alias_map[b])
                    highest = max(d_alias_map[a], d_alias_map[b])
                    labels[lowest].extend(labels[highest])
                    labels[lowest].append(rule)
                    clusters[lowest].update(clusters[highest])
                    for x in clusters[highest]:
                        d_alias_map[x] = lowest
                    del labels[highest]
                    del clusters[highest]
                    d_alias_map[a] = lowest
                    d_alias_map[b] = lowest
            else:
                # a is an alias; first time I see b
                d_alias_map[b] = d_alias_map[a]
                clusters[d_alias_map[a]].add(b)
                labels[d_alias_map[a]].append(rule)
        else:
            if b in d_alias_map:
                #b_src = d_alias_map[b]
                # b_src < a by construction
                d_alias_map[a] = d_alias_map[b]
                clusters[d_alias_map[b]].add(a)
                labels[d_alias_map[b]].append(rule)
            else:
                # First time I see this pair (guaranteed sorted)
                d_alias_map[a] = a
                d_alias_map[b] = a
                clusters[a] = set([a, b])
                labels[a] = [rule]

    # Merge pairs in deterministic (numeric uid) order; strongest clue wins.
    for (a, b), list_clues in sorted(
            clues.items(), key=lambda e: (int(e[0][0]), int(e[0][1]))):
        if print_flag:
            print(((a, b), list_clues))
        aa = aliases[a]
        ab = aliases[b]
        if EMAIL in list_clues:
            merge(a, b, EMAIL)
        elif len(set(list_clues)) >= 2:
            for clue in set(list_clues):
                merge(a, b, clue)
            # merge(a,b,TWO)
        elif FULL_NAME in list_clues:
            merge(a, b, FULL_NAME)
        elif NAME_APPENDED in list_clues:
            merge(a, b, NAME_APPENDED)
        elif NAME_PARTS in list_clues:
            merge(a, b, NAME_PARTS)
        elif COMP_EMAIL_PREFIX in list_clues:
            merge(a, b, COMP_EMAIL_PREFIX)
        elif SIMPLE_NAME in list_clues:
            merge(a, b, SIMPLE_NAME)
        elif PREFIX_NAME in list_clues:
            merge(a, b, PREFIX_NAME)
    # print 'Done: clusters'
    # Pass 4: validate clusters and elect representatives.
    for uid, member_uids in clusters.items():
        # print ((uid, member_uids))
        members = [aliases[m] for m in member_uids]
        # Count fake/real
        c = Counter([m.record_type for m in members])
        real = [m for m in members if m.record_type == USR_REAL]
        with_location = [m for m in real if m.location is not None]
        fake = [m for m in members if m.record_type == USR_FAKE]
        # Count rules that fired
        cl = Counter(labels[uid])
        if print_flag:
            print(cl)
        is_valid = False
        # If all have the same email there is no doubt
        if cl.get(EMAIL, 0) >= (len(members) - 1):
            is_valid = True
        # If all the REALs have the same email, assume all the FAKEs are this REAL
        elif len(Counter([m.email for m in real]).keys()) == 1:
            is_valid = True
        # If there is at most one real, at least two rules fired, and each rule applied to each pair
        elif len(cl.keys()) > 1 and min(cl.values()) >= (len(members) - 1):
            is_valid = True
        # At most one real, the only rule that fired is COMP_EMAIL_PREFIX or FULL_NAME
        elif len(cl.keys()) == 1 and \
                (cl.get(COMP_EMAIL_PREFIX,0) or cl.get(FULL_NAME,0) or \
                 cl.get(NAME_PARTS,0) or cl.get(NAME_APPENDED,0)):
            is_valid = True
        # All with same full name and location / same full name and email domain
        elif cl.get(FULL_NAME,0) >= (len(members)-1) and \
                (cl.get(LOCATION,0) >= (len(members)-1) or
                 cl.get(DOMAIN,0) >= (len(members)-1)):
            is_valid = True
        # All same composite email prefix / same full name
        elif (cl.get(COMP_EMAIL_PREFIX, 0) >= (len(members) - 1)
              or cl.get(FULL_NAME, 0) >= (len(members) - 1)):
            is_valid = True
        elif cl.get(NAME_APPENDED, 0) >= (len(members) - 1):
            is_valid = True
        elif cl.get(FULL_NAME, 0) >= (len(members) - 1):
            is_valid = True
        # The only two rules that fired are full name and email, in some combination
        elif len(cl.keys()) == 2 and cl.get(FULL_NAME, 0) > 0 and cl.get(
                EMAIL, 0) > 0:
            is_valid = True
        elif len(cl.keys()) == 3 and cl.get(FULL_NAME, 0) > 0 and cl.get(
                EMAIL, 0) > 0 and cl.get(SIMPLE_NAME, 0) > 0:
            is_valid = True
        elif len(cl.keys()) == 2 and cl.get(EMAIL, 0) > 0 and cl.get(
                SIMPLE_NAME, 0) > 0:
            is_valid = True
        elif cl.get(PREFIX_NAME, 0) > 0:
            is_valid = True
        elif cl.get(SIMPLE_NAME,0) > 0 and cl.get(FULL_NAME,0) > 0 \
                and cl.get(SIMPLE_EMAIL_PREFIX,0) > 0 and cl.get(EMAIL,0) > 0:
            is_valid = True
        elif cl.get(SIMPLE_NAME, 0) > 0:
            is_valid = True
        elif cl.get(NAME_PARTS, 0) >= (len(members) - 1):
            is_valid = True
        else:
            # is_valid = True
            # continue
            # Split by email address if at least 2 share one
            if cl.get(EMAIL, 0):
                ce = [
                    e for e, c in Counter([m.email for m in members]).items()
                    if c > 1
                ]
                for e in ce:
                    extra_members = [m for m in members if m.email == e]
                    extra_real = [
                        m for m in extra_members
                        if m.record_type == USR_REAL
                    ]
                    extra_with_location = [
                        m for m in extra_real if m.location is not None
                    ]
                    if len(extra_real):
                        if len(extra_with_location):
                            # Pick the one with the oldest account with location, if available
                            rep = sorted(extra_with_location,
                                         key=lambda m: int(m.uid))[0]
                        else:
                            # Otherwise pick the one with the oldest account
                            rep = sorted(extra_real,
                                         key=lambda m: int(m.uid))[0]
                    else:
                        rep = sorted(extra_members,
                                     key=lambda m: int(m.uid))[0]
                    # w_log.writerow([])
                    # w_log.writerow([rep.uid, rep.login, rep.name, rep.email, rep.location])
                    for a in extra_members:
                        if a.uid != rep.uid:
                            # w_log.writerow([a.uid, a.login, a.name, a.email, a.location])
                            # writer.writerow([a.uid, rep.uid])
                            unmask[a.uid] = rep.uid
                            # print ('Mapped:' + str((a.uid, rep.uid)))
            # w_maybe.writerow([])
            # w_maybe.writerow([str(cl.items())])
            if print_flag:
                print(str(cl.items()))
                for m in members:
                    print([m.uid, m.name, m.email])
                    # w_maybe.writerow([m.uid, m.login, m.name, m.email, m.location])
        if is_valid:
            # Determine group representative
            if len(real):
                if len(with_location):
                    # Pick the one with the oldest account with location, if available
                    rep = sorted(with_location,
                                 key=lambda m: int(m.uid))[0]
                else:
                    # Otherwise pick the one with the oldest account
                    rep = sorted(real, key=lambda m: int(m.uid))[0]
            else:
                rep = sorted(members, key=lambda m: int(m.uid))[0]
            # w_log.writerow([])
            # w_log.writerow([str(cl.items())])
            # w_log.writerow([rep.uid, rep.login, rep.name, rep.email, rep.location])
            for a in members:
                if a.uid != rep.uid:
                    # w_log.writerow([a.uid, a.login, a.name, a.email, a.location])
                    # writer.writerow([a.uid, rep.uid])
                    unmask[a.uid] = rep.uid
                    if print_flag:
                        print('Mapped:' + str((a.uid, rep.uid)))
    return unmask
def main (args, app): new_alias = Alias(name=args.name, command=args.command) new_alias.store(app.config) app.config.save( ) print "added", new_alias.format_url( )
def add_alias(self, left: str, right: str):
    """Instantiate an Alias connecting `right`'s net to `left`'s net.

    The instance is registered under the "<left>_<right>" key and returned.
    """
    key = "{}_{}".format(left, right)
    alias_inst = Alias(key, self, self.get_net(right), self.get_net(left))
    self._instances[key] = alias_inst
    return alias_inst
def main(input_dir_path: str, out_dir_path: str):
    """Resolve contributor identities from *contributors.csv files.

    Reads contributor rows from `input_dir_path`, builds email/prefix/
    domain/login/name lookup tables, collects pairwise clues, clusters
    matching users, and writes the uid -> representative-uid mapping to
    <out_dir>/idm (CSV logs plus a pickled aliasMap.dict).
    """
    log.info("Input dir: %s; out_dir: %s", input_dir_path, out_dir_path)
    try:
        out_dir = os.path.abspath(out_dir_path)
    except IndexError:
        out_dir = os.path.abspath('./')
    out_dir = os.path.join(out_dir, 'idm')
    os.makedirs(out_dir, exist_ok=True)
    os.makedirs(os.path.join(out_dir, 'dict'), exist_ok=True)
    # All-uppercase 8-letter logins are treated as fake/machine accounts.
    fakeusr_rex = regex.compile(r'\A[A-Z]{8}$')
    unmask = {}
    w_log = CsvWriter(csv_file=os.path.join(out_dir, 'idm_log.csv'))
    writer = CsvWriter(csv_file=os.path.join(out_dir, 'idm_map.csv'))
    w_maybe = CsvWriter(csv_file=os.path.join(out_dir, 'idm_maybe.csv'))
    idx = 0
    step = 100000
    curidx = step
    aliases = {}
    # Helper structures
    d_email_uid = {}
    d_uid_email = {}
    d_prefix_uid = {}
    d_uid_prefix = {}
    d_comp_prefix_uid = {}
    d_uid_comp_prefix = {}
    d_uid_domain = {}
    d_domain_uid = {}
    d_name_uid = {}
    d_uid_name = {}
    d_login_uid = {}
    d_uid_login = {}
    #df = pd.read_csv(input_dir_path, index_col=False, na_filter=False)
    df = utility.read_from_folder(input_dir_path, "*contributors.csv")
    users = [
        SzzContributor(getattr(row, "CONTRIBUTOR_ID"), getattr(row, "NAME"),
                       getattr(row, "EMAIL"))
        for row in df.itertuples(index=False)
    ]
    log.info("Users to parse: %d", len(users))
    for user in users:
        uid = user.id
        login = user.name
        name = user.name
        email = user.email
        # BUG FIX: the original used `is` for string comparison, which
        # tests object identity, not equality — the placeholder GitHub
        # account was never actually skipped.
        if name == "github" and email == "*****@*****.**":
            continue
        # Identity mapping by default; merging rewrites entries below.
        unmask[uid] = uid
        m = fakeusr_rex.search(login)
        if m is not None:
            record_type = USR_FAKE
        else:
            record_type = USR_REAL
        # a = Alias(record_type, uid, login, name, email, location, user_type)
        a = Alias(record_type, uid, login, name, email)
        aliases[uid] = a
        # - email
        d_uid_email[a.uid] = a.email
        if a.email is not None:
            d_email_uid.setdefault(a.email, {a.uid})
            d_email_uid[a.email].add(a.uid)
        # - prefix (composite prefixes containing '.' or '_' indexed apart)
        d_uid_prefix[a.uid] = a.email_prefix
        d_uid_comp_prefix[a.uid] = a.email_prefix
        if a.email_prefix is not None:
            if len(a.email_prefix.split('.')) > 1 or len(
                    a.email_prefix.split('_')) > 1:
                d_comp_prefix_uid.setdefault(a.email_prefix, {a.uid})
                d_comp_prefix_uid[a.email_prefix].add(a.uid)
            else:
                d_prefix_uid.setdefault(a.email_prefix, {a.uid})
                d_prefix_uid[a.email_prefix].add(a.uid)
        # - domain
        d_uid_domain[a.uid] = a.email_domain
        if a.email_domain is not None:
            d_domain_uid.setdefault(a.email_domain, {a.uid})
            d_domain_uid[a.email_domain].add(a.uid)
        # - login (REAL users also indexed case-insensitively)
        d_uid_login[a.uid] = a.login
        if a.login is not None:
            d_login_uid.setdefault(a.login, set([a.uid]))
            d_login_uid[a.login].add(a.uid)
            if a.record_type == USR_REAL:
                d_login_uid.setdefault(a.login.lower(), set([a.uid]))
                d_login_uid[a.login.lower()].add(a.uid)
        # - name
        d_uid_name[a.uid] = a.name
        if a.name is not None and len(a.name):
            d_name_uid.setdefault(a.name, {a.uid})
            d_name_uid[a.name].add(a.uid)
            if len(a.name.split(' ')) == 1:
                d_name_uid.setdefault(a.name.lower(), {a.uid})
                d_name_uid[a.name.lower()].add(a.uid)
        idx += 1
        if idx >= curidx:
            # BUG FIX: log.info was called with a non-format first argument
            # plus an extra positional arg, which makes the logging module
            # raise/print a formatting error instead of the progress line.
            log.info("%s / 30", curidx / step)
            curidx += step
    log.info('Done: helpers')
    clues = {}
    for email, set_uid in d_email_uid.items():
        if len(set_uid) > THR_MIN:
            for a, b in combinations(sorted(set_uid, key=lambda uid: uid), 2):
                clues.setdefault((a, b), [])
                clues[(a, b)].append(EMAIL)
    log.info('Done: email')
    for prefix, set_uid in d_comp_prefix_uid.items():
        if THR_MIN < len(set_uid) < THR_MAX:
            if len(prefix) >= 3:
                for a, b in combinations(
                        sorted(set_uid, key=lambda uid: uid), 2):
                    clues.setdefault((a, b), [])
                    clues[(a, b)].append(COMP_EMAIL_PREFIX)
    log.info('Done: comp email prefix')
    for prefix, set_uid in d_prefix_uid.items():
        if THR_MIN < len(set_uid) < THR_MAX:
            if len(prefix) >= 3:
                for a, b in combinations(
                        sorted(set_uid, key=lambda uid: uid), 2):
                    clues.setdefault((a, b), [])
                    clues[(a, b)].append(SIMPLE_EMAIL_PREFIX)
    log.info('Done: email prefix')
    for prefix in set(d_prefix_uid.keys()).intersection(
            set(d_login_uid.keys())):
        if len(d_prefix_uid[prefix]) < THR_MAX:
            for a, b in product(
                    sorted(d_login_uid[prefix], key=lambda uid: uid),
                    sorted(d_prefix_uid[prefix], key=lambda uid: uid)):
                if a < b:
                    clues.setdefault((a, b), [])
                    if SIMPLE_EMAIL_PREFIX not in clues[(a, b)]:
                        clues[(a, b)].append(PREFIX_LOGIN)
    log.info('Done: prefix=login')
    for prefix in set(d_prefix_uid.keys()).intersection(
            set(d_name_uid.keys())):
        if len(d_prefix_uid[prefix]) < THR_MAX and len(
                d_name_uid[prefix]) < THR_MAX:
            for a, b in product(
                    sorted(d_name_uid[prefix], key=lambda uid: uid),
                    sorted(d_prefix_uid[prefix], key=lambda uid: uid)):
                if a < b:
                    clues.setdefault((a, b), [])
                    if SIMPLE_EMAIL_PREFIX not in clues[(a, b)]:
                        clues[(a, b)].append(PREFIX_NAME)
    log.info('Done: prefix=name')
    for prefix in set(d_login_uid.keys()).intersection(
            set(d_name_uid.keys())):
        if len(d_name_uid[prefix]) < THR_MAX:
            for a, b in product(
                    sorted(d_name_uid[prefix], key=lambda uid: uid),
                    sorted(d_login_uid[prefix], key=lambda uid: uid)):
                if a < b:
                    clues.setdefault((a, b), [])
                    if SIMPLE_EMAIL_PREFIX not in clues[(a, b)]:
                        clues[(a, b)].append(LOGIN_NAME)
    log.info('Done: login=name')
    # print d_name_uid.items()
    for name, set_uid in d_name_uid.items():
        if len(set_uid) > THR_MIN and len(set_uid) < THR_MAX:
            if len(name.split(' ')) > 1:
                for a, b in combinations(
                        sorted(set_uid, key=lambda uid: uid), 2):
                    clues.setdefault((a, b), [])
                    clues[(a, b)].append(FULL_NAME)
                    # print a,b,FULL_NAME
            else:
                for a, b in combinations(
                        sorted(set_uid, key=lambda uid: uid), 2):
                    clues.setdefault((a, b), [])
                    clues[(a, b)].append(SIMPLE_NAME)
    log.info('Done: full/simple name')
    for domain, set_uid in d_domain_uid.items():
        if THR_MIN < len(set_uid) < THR_MAX:
            for a, b in combinations(sorted(set_uid, key=lambda uid: uid), 2):
                clues.setdefault((a, b), [])
                clues[(a, b)].append(DOMAIN)
    log.info('Done: email domain')
    # BUG FIX: `merge`, `clusters` and `labels` were referenced below but
    # never defined in this function (they exist only inside
    # resolve_aliases), so the clustering stage raised NameError. Define
    # the same union-find closure locally.
    # d_alias_map: uid -> cluster root; clusters: root -> member uids;
    # labels: root -> list of rules that merged the cluster.
    d_alias_map = {}
    clusters = {}
    labels = {}

    def merge(a, b, rule):
        # Contract: a < b
        assert a < b, "A must be less than B"
        if a in d_alias_map:
            if b in d_alias_map:
                if d_alias_map[a] == d_alias_map[b]:
                    labels[d_alias_map[a]].append(rule)
                else:
                    # Union two existing clusters under the lower root.
                    lowest = min(d_alias_map[a], d_alias_map[b])
                    highest = max(d_alias_map[a], d_alias_map[b])
                    labels[lowest].extend(labels[highest])
                    labels[lowest].append(rule)
                    clusters[lowest].update(clusters[highest])
                    for x in clusters[highest]:
                        d_alias_map[x] = lowest
                    del labels[highest]
                    del clusters[highest]
                    d_alias_map[a] = lowest
                    d_alias_map[b] = lowest
            else:
                # a is already clustered; first time we see b.
                d_alias_map[b] = d_alias_map[a]
                clusters[d_alias_map[a]].add(b)
                labels[d_alias_map[a]].append(rule)
        else:
            if b in d_alias_map:
                d_alias_map[a] = d_alias_map[b]
                clusters[d_alias_map[b]].add(a)
                labels[d_alias_map[b]].append(rule)
            else:
                # First time we see this (sorted) pair.
                d_alias_map[a] = a
                d_alias_map[b] = a
                clusters[a] = set([a, b])
                labels[a] = [rule]

    for (a, b), list_clues in sorted(clues.items(),
                                     key=lambda e: (e[0][0], e[0][1])):
        if EMAIL in list_clues:
            merge(a, b, EMAIL)
        elif len(list_clues) >= 2:
            for clue in list_clues:
                merge(a, b, clue)
        elif FULL_NAME in list_clues:
            merge(a, b, FULL_NAME)
        elif COMP_EMAIL_PREFIX in list_clues:
            merge(a, b, COMP_EMAIL_PREFIX)
    log.info('Done: clusters')
    for uid, member_uids in clusters.items():
        members = [aliases[m] for m in member_uids]
        # Count fake/real
        real = [m for m in members if m.record_type == USR_REAL]
        # Count rules that fired
        cl = Counter(labels[uid])
        is_valid = False
        # If all have the same email there is no doubt
        if cl.get(EMAIL, 0) >= (len(members) - 1):
            is_valid = True
        # If all the REALs have the same email, assume all the FAKEs are this REAL
        elif len(Counter([m.email for m in real]).keys()) == 1:
            is_valid = True
        # At most one real, at least two rules fired, each rule applied to each pair
        elif len(real) <= 1 and len(cl.keys()) > 1 and min(
                cl.values()) >= (len(members) - 1):
            is_valid = True
        # At most one real, the only rule that fired is COMP_EMAIL_PREFIX or FULL_NAME
        elif len(real) <= 1 and len(cl.keys()) == 1 and \
                (cl.get(COMP_EMAIL_PREFIX, 0) or cl.get(FULL_NAME, 0)):
            is_valid = True
        # All with same full name and location / same full name and email domain
        elif cl.get(FULL_NAME, 0) >= (len(members) - 1) and \
                (cl.get(LOCATION, 0) >= (len(members) - 1) or
                 cl.get(DOMAIN, 0) >= (len(members) - 1)):
            is_valid = True
        # All fake and same composite email prefix / same full name
        elif len(real) == 0 and \
                (cl.get(COMP_EMAIL_PREFIX, 0) >= (len(members) - 1) or
                 cl.get(FULL_NAME, 0) >= (len(members) - 1)):
            is_valid = True
        else:
            # Split by email address if at least 2 share one
            if cl.get(EMAIL, 0):
                ce = [
                    e for e, c in Counter([m.email for m in members]).items()
                    if c > 1
                ]
                for e in ce:
                    extra_members = [m for m in members if m.email == e]
                    rep = sorted(extra_members, key=lambda m: m.uid)[0]
                    w_log.writerow([])
                    w_log.writerow([rep.uid, rep.name, rep.email])
                    for a in extra_members:
                        if a.uid != rep.uid:
                            w_log.writerow([a.uid, a.name, a.email])
                            writer.writerow([a.uid, rep.uid])
                            unmask[a.uid] = rep.uid
            # Write also the "maybe" clusters to the alias map.
            rep = sorted(members, key=lambda m: m.uid)[0]
            w_maybe.writerow([])
            w_maybe.writerow([str(cl.items())])
            for m in members:
                if m.uid != rep.uid:
                    unmask[m.uid] = rep.uid
                    writer.writerow([m.uid, rep.uid])
                w_maybe.writerow([m.uid, m.name, m.email])
        if is_valid:
            # Group representative: the member with the lowest uid.
            rep = sorted(members, key=lambda m: m.uid)[0]
            w_log.writerow([])
            w_log.writerow([str(cl.items())])
            w_log.writerow([rep.uid, rep.name, rep.email])
            for a in members:
                if a.uid != rep.uid:
                    w_log.writerow([a.uid, a.name, a.email])
                    writer.writerow([a.uid, rep.uid])
                    unmask[a.uid] = rep.uid
    log.info("Unmasked size: %d", len(unmask))
    # Use a context manager so the pickle file handle is always closed.
    with open(os.path.join(out_dir, 'dict', 'aliasMap.dict'), 'wb') as fh:
        pickle.dump(unmask, fh)