Example #1
0
    def compile(self):
        ''' Compile the text files to DDStorm modules.

        Collects every ``*.txt`` file under the configured ``library_path``
        and ``custom_path``, removes existing ``*.module`` files from
        ``module_path`` (creating the directory when missing) and then passes
        each collected file, in the order given by ``_sortPriority``, to
        ``_makeModule`` (library files first, custom files second).
        '''
        self.source = set()
        self.custom = set()
        self.alias = Alias(self._conf)

        def collect_txt(root, found):
            # Recursively add every *.txt file below root to the found set.
            for path, _subdirs, files in os.walk(root):
                for name in files:
                    if fnmatch(name, "*.txt"):
                        found.add(os.path.join(path, name))

        collect_txt(self._conf.get("library_path"), self.source)
        collect_txt(self._conf.get("custom_path"), self.custom)

        # Create module directory if not already present and delete all module files
        module_path = self._conf.get("module_path")
        if not os.path.isdir(module_path):
            os.makedirs(module_path)
        for f in os.listdir(module_path):
            if fnmatch(f, "*.module"):
                # os.path.join works whether or not module_path ends with a
                # separator; plain string concatenation only worked with one.
                os.unlink(os.path.join(module_path, f))

        # Regex matching the trailing ".<digits>" priority suffix (digits only)
        self.priorityRegex = re.compile(r"(?<=\.)\d+$")

        # First sort files by priority then compile them to module
        for src in self._sortPriority(self.source):
            self._makeModule(src)
        for src in self._sortPriority(self.custom):
            self._makeModule(src)
 def _build_alias(self):
     """Render the alias map into ``_alias.py`` and wrap it in a length helper.

     The template named by ``VARIABLE_TEMPLATE`` is loaded from
     ``TEMPLATES_PATH`` and rendered with ``varName='g_aliasMap'`` and the
     ``repr`` of the map returned by ``Alias(self._base_path).get_map()``.

     Returns:
         ``VariablesLengthHelper`` constructed from the alias map.
     """
     environment = Environment(loader=FileSystemLoader(self.TEMPLATES_PATH))
     template = environment.get_template(self.VARIABLE_TEMPLATE)
     alias_map = Alias(self._base_path).get_map()
     # NOTE(review): the file is opened in binary mode; this presumably relies
     # on render() returning a byte-compatible string (Python 2) -- confirm.
     with open('_alias.py', 'wb') as out:
         context = dict(varName='g_aliasMap', value=repr(alias_map))
         out.write(template.render(context))
     return VariablesLengthHelper(alias_map)
Example #3
0
 def parseMSOffice2011Plist(self, mru_file):
     """Collect parsed file aliases from an MS Office 2011 MRU plist.

     Args:
         mru_file: Passed unchanged to ``self.load_bplist``.

     Returns:
         List of ``Alias(...).parse()`` results for the Word (MSWD), Excel
         (XCEL) and PowerPoint (PPT3) MRU lists, in that order. Empty list
         when the plist could not be loaded.
     """
     plist = self.load_bplist(mru_file)
     if plist is None:
         return []
     aliases = []
     # The backslashes are literal characters of the plist keys.
     mru_keys = (
         "14\\File MRU\\MSWD",
         "14\\File MRU\\XCEL",
         "14\\File MRU\\PPT3",
     )
     for key in mru_keys:
         try:
             for item in plist[key]:
                 aliases.append(Alias(data=item["File Alias"]).parse())
         except Exception:
             # Best effort: a missing key or an unparsable entry ends this
             # application's list; entries parsed so far are kept.
             pass
     return aliases
Example #4
0
 def parseSidebarlistsPlist(self, mru_file):
     """Collect parsed volume aliases from a sidebar-lists plist.

     Args:
         mru_file: Passed unchanged to ``self.load_bplist``.

     Returns:
         List of ``Alias(...).parse()`` results for every entry of the
         ``VolumesList`` under ``systemitems`` and then under ``favorites``.
         Empty list when the plist could not be loaded.
     """
     plist = self.load_bplist(mru_file)
     if plist is None:
         return []
     aliases = []
     for section in ("systemitems", "favorites"):
         try:
             # Each section reads its own entries; previously the favorites
             # pass re-read the systemitems list by index.
             for entry in plist[section]['VolumesList']:
                 try:
                     aliases.append(Alias(data=entry['Alias']).parse())
                 except Exception:
                     # Best effort: skip entries that cannot be parsed.
                     pass
         except Exception:
             # Best effort: the section is missing or malformed.
             pass
     return aliases
Example #5
0
 def parseRecentItemsPlist(self, mru_file):
     """Collect parsed bookmarks and aliases from a recent-items plist.

     Args:
         mru_file: Passed unchanged to ``self.load_bplist``.

     Returns:
         ``(bookmarks, aliases)``: bookmarks come from the ``CustomListItems``
         of ``RecentApplications``, ``RecentDocuments`` and ``RecentServers``;
         aliases come from ``Applications``, ``Documents`` and ``Servers``.
         NOTE(review): when the plist cannot be loaded a bare ``[]`` is
         returned instead of a pair; kept for compatibility with callers.
     """
     plist = self.load_bplist(mru_file)
     if plist is None:
         return []
     bookmarks = []
     aliases = []
     for section in ("RecentApplications", "RecentDocuments", "RecentServers"):
         try:
             for item in plist[section]["CustomListItems"]:
                 bookmarks.append(Bookmark(data=item["Bookmark"]).parse())
         except Exception:
             # Best effort: missing section or unparsable entry ends this list.
             pass
     for section in ("Applications", "Documents", "Servers"):
         try:
             for item in plist[section]["CustomListItems"]:
                 aliases.append(Alias(data=item["Alias"]).parse())
         except Exception:
             # Best effort: missing section or unparsable entry ends this list.
             pass
     return bookmarks, aliases
Example #6
0
 def parseFinderPlist(self, mru_file):
     """Collect parsed bookmarks and aliases from a Finder plist.

     Args:
         mru_file: Passed unchanged to ``self.load_bplist``.

     Returns:
         ``(bookmarks, aliases)`` built from each ``FXRecentFolders`` entry:
         its ``file-bookmark`` value and its ``file-data``/``_CFURLAliasData``
         value, each parsed independently.
         NOTE(review): when the plist cannot be loaded a bare ``[]`` is
         returned instead of a pair; kept for compatibility with callers.
     """
     plist = self.load_bplist(mru_file)
     if plist is None:
         return []
     bookmarks = []
     aliases = []
     try:
         for item in plist["FXRecentFolders"]:
             try:
                 bookmarks.append(Bookmark(data=item["file-bookmark"]).parse())
             except Exception:
                 # Best effort: entry has no (parsable) bookmark.
                 pass
             try:
                 aliases.append(Alias(data=item["file-data"]["_CFURLAliasData"]).parse())
             except Exception:
                 # Best effort: entry has no (parsable) alias data.
                 pass
     except Exception:
         # Best effort: the FXRecentFolders list is missing or malformed.
         pass
     return bookmarks, aliases
Example #7
0
class Profile(namedtuple('Profile', 'v e iv ie sim')):
    """Named tuple with fields ``v``, ``e``, ``iv``, ``ie`` and ``sim``.

    ``build`` selects a profile builder from the ``relOp`` value of a node.
    """

    # Shared alias registry, called with a column in _antialias.
    # NOTE(review): exact semantics of Alias(set) are defined elsewhere.
    _aliases = Alias(set)

    # relOp name -> builder callable invoked as builder(node, inputs).
    _profiles = dict(
        JdbcTableScan=tablescan,
        JdbcProjectRel=projection,
        JdbcFilterRel=selection,
        JdbcJoinRel=join,
        JdbcAggregateRel=aggregate,
        JdbcToEnumerableConverter=jdbctoenumerate,
    )

    @classmethod
    def _antialias(cls, columns):
        """Return the set of values from flattening ``_aliases`` applied to each column."""
        resolved = map(cls._aliases, columns)
        return set(flat(resolved))

    @classmethod
    def build(cls, node, inputs):
        """Call the builder registered for ``node.get('relOp')`` with ``(node, inputs)``.

        Raises ``KeyError`` when no builder is registered for that value.
        """
        builder = cls._profiles[node.get('relOp')]
        return builder(node, inputs)
Example #8
0
    def add_alias(self, new_cpp_type_name, old_cpp_type_name):
        try:
            direct_new_cpp_global_expr = self.cpp_type_expr_parser.parse(
                new_cpp_type_name).prefix(self.components)
            direct_old_cpp_global_expr = self.resolve_cpp_type_expr(
                old_cpp_type_name)
            self.type_mgr.add_alias(direct_new_cpp_global_expr,
                                    direct_old_cpp_global_expr)
            direct_new_kl_local_name = new_cpp_type_name
            direct_new_kl_global_name = '_'.join(self.nested_kl_names +
                                                 [direct_new_kl_local_name])
            direct_old_dqti = self.type_mgr.get_dqti(
                direct_old_cpp_global_expr)
            print "direct_old_dqti.type_info.kl.name = " + str(
                direct_old_dqti.type_info.kl.name)
            print "direct_old_dqti.type_info.edk.name = " + str(
                direct_old_dqti.type_info.edk.name)
            print "direct_old_dqti.type_info.lib.name = " + str(
                direct_old_dqti.type_info.lib.name)
            print "direct_old_dqti.type_info.lib.expr = " + str(
                direct_old_dqti.type_info.lib.expr)
            direct_alias = Alias(self, direct_new_kl_global_name,
                                 direct_old_dqti.type_info)
            self.ext.add_decl(direct_alias)

            const_ptr_new_cpp_type_expr = PointerTo(
                Const(direct_new_cpp_global_expr))
            const_ptr_old_cpp_type_expr = PointerTo(
                Const(direct_old_cpp_global_expr))
            self.type_mgr.add_alias(const_ptr_new_cpp_type_expr,
                                    const_ptr_old_cpp_type_expr)
            const_ptr_new_kl_type_name = direct_new_kl_global_name + "_CxxConstPtr"
            const_ptr_old_dqti = self.type_mgr.get_dqti(
                const_ptr_old_cpp_type_expr)
            const_ptr_old_kl_type_name = const_ptr_old_dqti.type_info.kl.name.compound
            const_ptr_alias = Alias(self, const_ptr_new_kl_type_name,
                                    const_ptr_old_dqti.type_info)
            self.ext.add_decl(const_ptr_alias)
            self.ext.add_kl_epilog("""
%s Make_%s(%s value) {
  return Make_%s(value);
}

%s Make_%s(io %s value) {
  return Make_%s(value);
}
""" % (
                const_ptr_new_kl_type_name,
                const_ptr_new_kl_type_name,
                direct_new_kl_global_name,
                const_ptr_old_kl_type_name,
                const_ptr_new_kl_type_name,
                const_ptr_new_kl_type_name,
                direct_new_kl_global_name,
                const_ptr_old_kl_type_name,
            ))

            mutable_ptr_new_cpp_type_expr = PointerTo(
                direct_new_cpp_global_expr)
            mutable_ptr_old_cpp_type_expr = PointerTo(
                direct_old_cpp_global_expr)
            self.type_mgr.add_alias(mutable_ptr_new_cpp_type_expr,
                                    mutable_ptr_old_cpp_type_expr)
            mutable_ptr_new_kl_type_name = direct_new_kl_global_name + "_CxxPtr"
            mutable_ptr_old_dqti = self.type_mgr.get_dqti(
                mutable_ptr_old_cpp_type_expr)
            mutable_ptr_old_kl_type_name = mutable_ptr_old_dqti.type_info.kl.name.compound
            mutable_ptr_alias = Alias(self, mutable_ptr_new_kl_type_name,
                                      mutable_ptr_old_dqti.type_info)
            self.ext.add_decl(mutable_ptr_alias)
            self.ext.add_kl_epilog("""
%s Make_%s(%s value) {
  return Make_%s(value);
}

%s Make_%s(io %s value) {
  return Make_%s(value);
}
""" % (
                mutable_ptr_new_kl_type_name,
                mutable_ptr_new_kl_type_name,
                direct_new_kl_global_name,
                mutable_ptr_old_kl_type_name,
                mutable_ptr_new_kl_type_name,
                mutable_ptr_new_kl_type_name,
                direct_new_kl_global_name,
                mutable_ptr_old_kl_type_name,
            ))

            const_ref_new_cpp_type_expr = ReferenceTo(
                Const(direct_new_cpp_global_expr))
            const_ref_old_cpp_type_expr = ReferenceTo(
                Const(direct_old_cpp_global_expr))
            self.type_mgr.add_alias(const_ref_new_cpp_type_expr,
                                    const_ref_old_cpp_type_expr)
            const_ref_new_kl_type_name = direct_new_kl_global_name + "_CxxConstRef"
            const_ref_old_dqti = self.type_mgr.get_dqti(
                const_ref_old_cpp_type_expr)
            const_ref_old_kl_type_name = const_ref_old_dqti.type_info.kl.name.compound
            const_ref_alias = Alias(self, const_ref_new_kl_type_name,
                                    const_ref_old_dqti.type_info)
            self.ext.add_decl(const_ref_alias)
            self.ext.add_kl_epilog("""
%s Make_%s(%s value) {
  return Make_%s(value);
}

%s Make_%s(io %s value) {
  return Make_%s(value);
}
""" % (
                const_ref_new_kl_type_name,
                const_ref_new_kl_type_name,
                direct_new_kl_global_name,
                const_ref_old_kl_type_name,
                const_ref_new_kl_type_name,
                const_ref_new_kl_type_name,
                direct_new_kl_global_name,
                const_ref_old_kl_type_name,
            ))

            mutable_ref_new_cpp_type_expr = ReferenceTo(
                direct_new_cpp_global_expr)
            mutable_ref_old_cpp_type_expr = ReferenceTo(
                direct_old_cpp_global_expr)
            self.type_mgr.add_alias(mutable_ref_new_cpp_type_expr,
                                    mutable_ref_old_cpp_type_expr)
            mutable_ref_new_kl_type_name = direct_new_kl_global_name + "_CxxRef"
            mutable_ref_old_dqti = self.type_mgr.get_dqti(
                mutable_ref_old_cpp_type_expr)
            mutable_ref_old_kl_type_name = mutable_ref_old_dqti.type_info.kl.name.compound
            mutable_ref_alias = Alias(self, mutable_ref_new_kl_type_name,
                                      mutable_ref_old_dqti.type_info)
            self.ext.add_decl(mutable_ref_alias)
            self.ext.add_kl_epilog("""
%s Make_%s(%s value) {
  return Make_%s(value);
}

%s Make_%s(io %s value) {
  return Make_%s(value);
}
""" % (
                mutable_ref_new_kl_type_name,
                mutable_ref_new_kl_type_name,
                direct_new_kl_global_name,
                mutable_ref_old_kl_type_name,
                mutable_ref_new_kl_type_name,
                mutable_ref_new_kl_type_name,
                direct_new_kl_global_name,
                mutable_ref_old_kl_type_name,
            ))

            return direct_alias
        except Exception as e:
            self.ext.warning("Ignoring alias '%s': %s" %
                             (new_cpp_type_name, e))
            return EmptyCommentContainer()
    uid = row[0]
    login = row[1].strip()
    name = row[2]
    user_type = row[7].strip()
    location = row[4]
    email = row[5]

    unmask[uid] = uid

    m = fakeusr_rex.search(login)
    if m is not None:
        record_type = USR_FAKE
    else:
        record_type = USR_REAL

    a = Alias(record_type, uid, login, name, email, location, user_type)
    aliases[uid] = a

    # - email
    d_uid_email[a.uid] = a.email
    if a.email is not None:
        d_email_uid.setdefault(a.email, set([a.uid]))
        d_email_uid[a.email].add(a.uid)

    # - prefix
    d_uid_prefix[a.uid] = a.email_prefix
    d_uid_comp_prefix[a.uid] = a.email_prefix
    if a.email_prefix is not None:
        if len(a.email_prefix.split('.')) > 1 or len(
                a.email_prefix.split('_')) > 1:
            d_comp_prefix_uid.setdefault(a.email_prefix, set([a.uid]))
Example #10
0
def resolve_aliases(slug, inputs):
    """Map every contributor uid to the uid of its representative record.

    Each input row becomes an ``Alias``. Records are indexed by e-mail, e-mail
    prefix and domain, login, several name variants and location. Every pair
    of records sharing an index key receives a clue; clues decide which pairs
    are merged into clusters; each cluster that passes the validity
    heuristics is collapsed onto a single representative.

    Args:
        slug: Not used by the active code; kept for interface compatibility.
        inputs: Object exposing ``iterrows()`` whose rows provide ``id``,
            ``name``, ``email``, ``login``, ``type``, ``location`` and
            ``record_type`` (presumably a pandas DataFrame -- confirm with
            callers). ``id`` values must be accepted by ``int()``.

    Returns:
        dict mapping each uid to its representative uid (itself when the
        record was not merged into another one).
    """
    print_flag = 0  # set to a truthy value for debug output

    unmask = {}
    aliases = {}

    # Reverse indexes: attribute value -> set of uids carrying that value.
    d_email_uid = {}
    d_prefix_uid = {}
    d_comp_prefix_uid = {}
    d_domain_uid = {}
    d_name_uid = {}
    d_name_parts_uid = {}
    d_name_app_uid = {}
    d_login_uid = {}
    d_location_uid = {}
    # Forward index read by the location rule.
    d_uid_name = {}

    def index(table, key, uid):
        # Register uid under key in a value -> {uids} table.
        table.setdefault(key, set()).add(uid)

    for _, row in inputs.iterrows():
        uid = row["id"]
        name = row["name"]
        email = row["email"]
        login = row["login"]
        user_type = "" if row["type"] is None else str(row["type"])
        location = "" if row["location"] is None else str(row["location"])

        unmask[uid] = uid

        record_type = USR_REAL if row["record_type"] == 1 else USR_FAKE

        a = Alias(record_type, uid, login, name, email, location, user_type)
        aliases[uid] = a

        # - email
        if a.email is not None:
            index(d_email_uid, a.email, a.uid)

        # - prefix: "composite" when it contains '.' or '_', simple otherwise
        prefix = a.email_prefix
        if prefix is not None:
            if '.' in prefix or '_' in prefix:
                index(d_comp_prefix_uid, prefix, a.uid)
            else:
                index(d_prefix_uid, prefix, a.uid)

        # - domain
        if a.email_domain is not None:
            index(d_domain_uid, a.email_domain, a.uid)

        # - login (real users are also indexed case-insensitively)
        if a.login is not None:
            index(d_login_uid, a.login, a.uid)
            if a.record_type == USR_REAL:
                index(d_login_uid, a.login.lower(), a.uid)

        # - name
        d_uid_name[a.uid] = a.name
        if a.name is not None and len(a.name):
            index(d_name_uid, a.name, a.uid)
            lowered = a.name.lower()

            if ' ' not in a.name:
                index(d_name_uid, lowered, a.uid)
                # janejohnson -> janejohnson (needed for matching)
                index(d_name_app_uid, lowered, a.uid)

            # jane johnson -> janejohnson
            index(d_name_app_uid, a.name.replace(" ", "").lower(), a.uid)

            if "@" in a.name:  # otherwise it will make "gmail", "com" as names
                index(d_name_parts_uid, a.name.split("@")[0].lower(), a.uid)
            else:
                # xiyi ji -> ji xiyi
                parts = lowered.replace(",", " ").replace(".", " ").split(' ')
                # Guarded with an if rather than `continue` so that the
                # location indexing below still runs for every record.
                if len(parts) == 2:
                    index(d_name_parts_uid, parts[-1] + " " + parts[0], a.uid)

        # - location
        if a.location is not None and len(a.location):
            index(d_location_uid, a.location, a.uid)

    clues = {}

    def ordered(uids):
        # uids sorted by their integer value.
        return sorted(uids, key=int)

    def clue_pairs(uids, rule):
        # Record rule for every unordered pair within uids.
        for a, b in combinations(ordered(uids), 2):
            clues.setdefault((a, b), []).append(rule)

    def clue_cross(left, right, rule):
        # Record rule for pairs (a from left, b from right) with a < b,
        # unless the pair already matched on the simple e-mail prefix.
        for a, b in product(ordered(left), ordered(right)):
            if a < b:
                found = clues.setdefault((a, b), [])
                if SIMPLE_EMAIL_PREFIX not in found:
                    found.append(rule)

    # Same e-mail
    for set_uid in d_email_uid.values():
        if len(set_uid) > THR_MIN:
            clue_pairs(set_uid, EMAIL)

    # Same composite / simple e-mail prefix (at least 3 characters long)
    for table, rule in ((d_comp_prefix_uid, COMP_EMAIL_PREFIX),
                        (d_prefix_uid, SIMPLE_EMAIL_PREFIX)):
        for prefix, set_uid in table.items():
            if THR_MIN < len(set_uid) < THR_MAX and len(prefix) >= 3:
                clue_pairs(set_uid, rule)

    # prefix = login
    for key in set(d_prefix_uid) & set(d_login_uid):
        if len(d_prefix_uid[key]) < THR_MAX:
            clue_cross(d_login_uid[key], d_prefix_uid[key], PREFIX_LOGIN)

    # prefix = name
    for key in set(d_prefix_uid) & set(d_name_uid):
        if len(d_prefix_uid[key]) < THR_MAX and len(d_name_uid[key]) < THR_MAX:
            clue_cross(d_name_uid[key], d_prefix_uid[key], PREFIX_NAME)

    # login = name
    for key in set(d_login_uid) & set(d_name_uid):
        if len(d_name_uid[key]) < THR_MAX:
            clue_cross(d_name_uid[key], d_login_uid[key], LOGIN_NAME)

    # Same full name (contains a space) / same simple name
    for name, set_uid in d_name_uid.items():
        if THR_MIN < len(set_uid) < THR_MAX:
            clue_pairs(set_uid,
                       FULL_NAME if len(name.split(' ')) > 1 else SIMPLE_NAME)

    # Same swapped name parts / same appended name / same e-mail domain
    for table, rule in ((d_name_parts_uid, NAME_PARTS),
                        (d_name_app_uid, NAME_APPENDED),
                        (d_domain_uid, DOMAIN)):
        for set_uid in table.values():
            if THR_MIN < len(set_uid) < THR_MAX:
                clue_pairs(set_uid, rule)

    # Same location, counted only for identical multi-word names
    for set_uid in d_location_uid.values():
        if len(set_uid) > THR_MIN:
            for a, b in combinations(ordered(set_uid), 2):
                na = d_uid_name[a]
                nb = d_uid_name[b]
                if (na is not None and nb is not None
                        and len(na.split()) > 1 and na == nb
                        and len(d_name_uid.get(na, set())) < THR_MAX):
                    clues.setdefault((a, b), []).append(LOCATION)

    d_alias_map = {}  # uid -> id of the cluster it belongs to
    clusters = {}     # cluster id -> set of member uids
    labels = {}       # cluster id -> list of rules that fired

    def merge(a, b, rule):
        # Contract: a < b
        assert a < b, "A must be less than B"
        if a in d_alias_map and b in d_alias_map:
            root_a = d_alias_map[a]
            root_b = d_alias_map[b]
            if root_a == root_b:
                labels[root_a].append(rule)
            else:
                # Fold the cluster with the higher id into the lower one.
                lowest = min(root_a, root_b)
                highest = max(root_a, root_b)
                labels[lowest].extend(labels[highest])
                labels[lowest].append(rule)
                clusters[lowest].update(clusters[highest])
                for x in clusters[highest]:
                    d_alias_map[x] = lowest
                del labels[highest]
                del clusters[highest]
                d_alias_map[a] = lowest
                d_alias_map[b] = lowest
        elif a in d_alias_map:
            # a is an alias; first time b is seen
            root = d_alias_map[a]
            d_alias_map[b] = root
            clusters[root].add(b)
            labels[root].append(rule)
        elif b in d_alias_map:
            root = d_alias_map[b]
            d_alias_map[a] = root
            clusters[root].add(a)
            labels[root].append(rule)
        else:
            # First time this pair is seen
            d_alias_map[a] = a
            d_alias_map[b] = a
            clusters[a] = set([a, b])
            labels[a] = [rule]

    # When a pair has a single kind of clue, only these kinds trigger a
    # merge; the first match in this order is used.
    single_clue_rules = (FULL_NAME, NAME_APPENDED, NAME_PARTS,
                         COMP_EMAIL_PREFIX, SIMPLE_NAME, PREFIX_NAME)

    for (a, b), list_clues in sorted(clues.items(),
                                     key=lambda e:
                                     (int(e[0][0]), int(e[0][1]))):
        if print_flag:
            print(((a, b), list_clues))

        distinct = set(list_clues)
        if EMAIL in distinct:
            merge(a, b, EMAIL)
        elif len(distinct) >= 2:
            for clue in distinct:
                merge(a, b, clue)
        else:
            for rule in single_clue_rules:
                if rule in distinct:
                    merge(a, b, rule)
                    break

    def pick_representative(group):
        # Lowest-uid real record with a location if any, else lowest-uid
        # real record, else lowest-uid record of the group.
        real = [m for m in group if m.record_type == USR_REAL]
        with_location = [m for m in real if m.location is not None]
        pool = with_location or real or group
        return min(pool, key=lambda m: int(m.uid))

    def is_valid_cluster(cl, members):
        # Heuristics deciding whether a whole cluster is one identity.
        need = len(members) - 1
        real = [m for m in members if m.record_type == USR_REAL]
        # If all have the same email there is no doubt
        if cl.get(EMAIL, 0) >= need:
            return True
        # If all the REALs have the same email, assume all the FAKEs are this REAL
        if len(set(m.email for m in real)) == 1:
            return True
        # At least two rules fired, and each rule applied to each pair
        if len(cl) > 1 and min(cl.values()) >= need:
            return True
        # The only rule that fired is one of the strong name/prefix rules
        if len(cl) == 1 and (cl.get(COMP_EMAIL_PREFIX, 0)
                             or cl.get(FULL_NAME, 0)
                             or cl.get(NAME_PARTS, 0)
                             or cl.get(NAME_APPENDED, 0)):
            return True
        # All with same full name and location / same full name and email domain
        if cl.get(FULL_NAME, 0) >= need and (cl.get(LOCATION, 0) >= need
                                             or cl.get(DOMAIN, 0) >= need):
            return True
        # All same composite email prefix / same full name
        if cl.get(COMP_EMAIL_PREFIX, 0) >= need or cl.get(FULL_NAME, 0) >= need:
            return True
        if cl.get(NAME_APPENDED, 0) >= need:
            return True
        # The only rules that fired are full name, email and/or simple name
        if len(cl) == 2 and cl.get(FULL_NAME, 0) > 0 and cl.get(EMAIL, 0) > 0:
            return True
        if (len(cl) == 3 and cl.get(FULL_NAME, 0) > 0
                and cl.get(EMAIL, 0) > 0 and cl.get(SIMPLE_NAME, 0) > 0):
            return True
        if len(cl) == 2 and cl.get(EMAIL, 0) > 0 and cl.get(SIMPLE_NAME, 0) > 0:
            return True
        if cl.get(PREFIX_NAME, 0) > 0:
            return True
        if (cl.get(SIMPLE_NAME, 0) > 0 and cl.get(FULL_NAME, 0) > 0
                and cl.get(SIMPLE_EMAIL_PREFIX, 0) > 0
                and cl.get(EMAIL, 0) > 0):
            return True
        if cl.get(SIMPLE_NAME, 0) > 0:
            return True
        return cl.get(NAME_PARTS, 0) >= need

    for uid, member_uids in clusters.items():
        members = [aliases[m] for m in member_uids]

        # Count rules that fired
        cl = Counter(labels[uid])
        if print_flag:
            print(cl)

        if is_valid_cluster(cl, members):
            rep = pick_representative(members)
            for a in members:
                if a.uid != rep.uid:
                    unmask[a.uid] = rep.uid
                    if print_flag:
                        print('Mapped:' + str((a.uid, rep.uid)))
            continue

        # Cluster rejected as a whole: still merge the sub-groups whose
        # members share an e-mail address.
        if cl.get(EMAIL, 0):
            email_counts = Counter([m.email for m in members])
            shared = [e for e, count in email_counts.items() if count > 1]
            for e in shared:
                extra_members = [m for m in members if m.email == e]
                rep = pick_representative(extra_members)
                for a in extra_members:
                    if a.uid != rep.uid:
                        unmask[a.uid] = rep.uid

        if print_flag:
            print(str(cl.items()))
            for m in members:
                print([m.uid, m.name, m.email])

    return unmask
Example #11
0
def main(args, app):
    """Create an alias from the parsed arguments and persist it.

    Builds an ``Alias`` from ``args.name`` and ``args.command``, stores it
    into ``app.config``, saves the configuration and prints a confirmation
    line containing the alias' formatted URL.
    """
    new_alias = Alias(name=args.name, command=args.command)
    new_alias.store(app.config)
    app.config.save()
    # A single pre-formatted argument prints "added <url>" under both the
    # Python 2 print statement and the Python 3 print function; the original
    # `print "added", ...` statement is a syntax error on Python 3.
    print("added %s" % (new_alias.format_url(),))
Example #12
0
 def add_alias(self, left: str, right: str):
     """Create, register and return an ``Alias`` joining two nets.

     The instance is named ``"<left>_<right>"`` and is stored under that
     name in ``self._instances``. Note the argument order passed to
     ``Alias``: the net looked up for ``right`` comes before the net
     looked up for ``left``.
     """
     alias_name = "{}_{}".format(left, right)
     right_net = self.get_net(right)
     left_net = self.get_net(left)
     alias = Alias(alias_name, self, right_net, left_net)
     self._instances[alias_name] = alias
     return alias
Example #13
0
def main(input_dir_path: str, out_dir_path: str) -> None:
    """Merge contributor identities and write the resulting alias map.

    Contributors are loaded from the ``*contributors.csv`` files found via
    ``utility.read_from_folder(input_dir_path, ...)`` (columns
    ``CONTRIBUTOR_ID``, ``NAME`` and ``EMAIL`` are read). Pairs of
    contributors that share an email, email prefix, email domain, login or
    name collect "clues"; pairs with enough evidence are passed to
    ``merge``. Every resulting cluster is then collapsed onto the member
    with the smallest uid.

    Outputs, all below ``<out_dir_path>/idm``:

    * ``idm_log.csv``   - clusters accepted as valid (and email sub-groups
      of rejected clusters),
    * ``idm_map.csv``   - ``[alias uid, representative uid]`` rows,
    * ``idm_maybe.csv`` - clusters that failed the validity rules (they are
      still written to the map),
    * ``dict/aliasMap.dict`` - pickled ``{uid: representative uid}`` dict.

    NOTE(review): ``merge``, ``clusters`` and ``labels`` are not defined in
    this function; presumably they are module-level state that ``merge``
    fills in - confirm against the rest of the module.

    Args:
        input_dir_path: Folder handed to ``utility.read_from_folder``.
        out_dir_path: Folder under which the ``idm`` directory is created.
    """
    log.info("Input dir: %s; out_dir: %s", input_dir_path, out_dir_path)
    # os.path.abspath never raises IndexError, so the former try/except
    # fallback to './' was unreachable and has been dropped.
    out_dir = os.path.join(os.path.abspath(out_dir_path), 'idm')
    os.makedirs(out_dir, exist_ok=True)
    os.makedirs(os.path.join(out_dir, 'dict'), exist_ok=True)

    # Logins made of exactly eight upper-case letters are flagged USR_FAKE.
    fakeusr_rex = regex.compile(r'\A[A-Z]{8}$')

    unmask = {}

    w_log = CsvWriter(csv_file=os.path.join(out_dir, 'idm_log.csv'))
    writer = CsvWriter(csv_file=os.path.join(out_dir, 'idm_map.csv'))
    w_maybe = CsvWriter(csv_file=os.path.join(out_dir, 'idm_maybe.csv'))

    step = 100000  # emit a progress line every `step` indexed users

    aliases = {}

    # Inverted indexes: attribute value -> set of uids carrying that value.
    d_email_uid = {}
    d_prefix_uid = {}
    d_comp_prefix_uid = {}
    d_domain_uid = {}
    d_name_uid = {}
    d_login_uid = {}

    # (uid_a, uid_b) with uid_a < uid_b -> list of clue labels for the pair.
    clues = {}

    def index(table, key, uid):
        """Record `uid` under `key` in an inverted index."""
        table.setdefault(key, set()).add(uid)

    def add_pair_clues(uids, clue):
        """Append `clue` to every unordered pair drawn from `uids`."""
        for pair in combinations(sorted(uids), 2):
            clues.setdefault(pair, []).append(clue)

    def add_cross_clues(left_uids, right_uids, clue):
        """Append `clue` to ordered pairs (a < b) across two uid sets.

        The pair entry is created even when nothing is appended, and the
        clue is skipped when the pair already matched on a simple email
        prefix.
        """
        for a, b in product(sorted(left_uids), sorted(right_uids)):
            if a < b:
                pair_clues = clues.setdefault((a, b), [])
                if SIMPLE_EMAIL_PREFIX not in pair_clues:
                    pair_clues.append(clue)

    df = utility.read_from_folder(input_dir_path, "*contributors.csv")

    users = [
        SzzContributor(getattr(row, "CONTRIBUTOR_ID"), getattr(row, "NAME"),
                       getattr(row, "EMAIL"))
        for row in df.itertuples(index=False)
    ]
    log.info("Users to parse: %d", len(users))

    idx = 0
    for user in users:
        uid = user.id
        login = user.name  # the input has no separate login column
        name = user.name
        email = user.email

        # Compare by value: `is` tests object identity and is practically
        # never true for strings read from a file.
        # NOTE(review): the address literal looks redacted - confirm the
        # intended value.
        if name == "github" and email == "*****@*****.**":
            continue

        unmask[uid] = uid

        if fakeusr_rex.search(login) is not None:
            record_type = USR_FAKE
        else:
            record_type = USR_REAL

        a = Alias(record_type, uid, login, name, email)
        aliases[uid] = a

        # - email
        if a.email is not None:
            index(d_email_uid, a.email, a.uid)

        # - prefix: prefixes containing '.' or '_' count as composite
        if a.email_prefix is not None:
            if '.' in a.email_prefix or '_' in a.email_prefix:
                index(d_comp_prefix_uid, a.email_prefix, a.uid)
            else:
                index(d_prefix_uid, a.email_prefix, a.uid)

        # - domain
        if a.email_domain is not None:
            index(d_domain_uid, a.email_domain, a.uid)

        # - login (real users are additionally indexed case-insensitively)
        if a.login is not None:
            index(d_login_uid, a.login, a.uid)
            if a.record_type == USR_REAL:
                index(d_login_uid, a.login.lower(), a.uid)

        # - name (single-word names are additionally indexed lower-cased)
        if a.name:
            index(d_name_uid, a.name, a.uid)
            if len(a.name.split(' ')) == 1:
                index(d_name_uid, a.name.lower(), a.uid)

        idx += 1
        if idx % step == 0:
            # The message must be a format string; the original passed a
            # float as the message plus a stray argument, which makes the
            # logging module report a formatting error.
            log.info('%d / %d', idx, len(users))

    log.info('Done: helpers')

    for set_uid in d_email_uid.values():
        # A shared email is strong evidence, so no upper bound is applied.
        if len(set_uid) > THR_MIN:
            add_pair_clues(set_uid, EMAIL)
    log.info('Done: email')

    for prefix, set_uid in d_comp_prefix_uid.items():
        if THR_MIN < len(set_uid) < THR_MAX and len(prefix) >= 3:
            add_pair_clues(set_uid, COMP_EMAIL_PREFIX)
    log.info('Done: comp email prefix')

    for prefix, set_uid in d_prefix_uid.items():
        if THR_MIN < len(set_uid) < THR_MAX and len(prefix) >= 3:
            add_pair_clues(set_uid, SIMPLE_EMAIL_PREFIX)
    log.info('Done: email prefix')

    for prefix in set(d_prefix_uid) & set(d_login_uid):
        if len(d_prefix_uid[prefix]) < THR_MAX:
            add_cross_clues(d_login_uid[prefix], d_prefix_uid[prefix],
                            PREFIX_LOGIN)
    log.info('Done: prefix=login')

    for prefix in set(d_prefix_uid) & set(d_name_uid):
        if len(d_prefix_uid[prefix]) < THR_MAX and len(
                d_name_uid[prefix]) < THR_MAX:
            add_cross_clues(d_name_uid[prefix], d_prefix_uid[prefix],
                            PREFIX_NAME)
    log.info('Done: prefix=name')

    for prefix in set(d_login_uid) & set(d_name_uid):
        if len(d_name_uid[prefix]) < THR_MAX:
            add_cross_clues(d_name_uid[prefix], d_login_uid[prefix],
                            LOGIN_NAME)
    log.info('Done: login=name')

    for name, set_uid in d_name_uid.items():
        if THR_MIN < len(set_uid) < THR_MAX:
            if len(name.split(' ')) > 1:
                add_pair_clues(set_uid, FULL_NAME)
            else:
                add_pair_clues(set_uid, SIMPLE_NAME)
    log.info('Done: full/simple name')

    for set_uid in d_domain_uid.values():
        if THR_MIN < len(set_uid) < THR_MAX:
            add_pair_clues(set_uid, DOMAIN)
    log.info('Done: email domain')

    # Pairs are unique dict keys, so sorting the items orders them by pair.
    for (a, b), list_clues in sorted(clues.items(),
                                     key=lambda e: (e[0][0], e[0][1])):
        if EMAIL in list_clues:
            merge(a, b, EMAIL)
        elif len(list_clues) >= 2:
            for clue in list_clues:
                merge(a, b, clue)
        elif FULL_NAME in list_clues:
            merge(a, b, FULL_NAME)
        elif COMP_EMAIL_PREFIX in list_clues:
            merge(a, b, COMP_EMAIL_PREFIX)
    log.info('Done: clusters')

    for uid, member_uids in clusters.items():
        members = [aliases[m] for m in member_uids]
        real = [m for m in members if m.record_type == USR_REAL]

        # Count rules that fired
        cl = Counter(labels[uid])

        # A rule "covers" the cluster when it fired at least this often.
        full_cover = len(members) - 1

        is_valid = False

        # If all have the same email there is no doubt
        if cl.get(EMAIL, 0) >= full_cover:
            is_valid = True
        # If all the REALs have the same email, assume all the FAKEs are
        # this REAL
        elif len({m.email for m in real}) == 1:
            is_valid = True
        # At most one real, at least two rules fired, and each rule
        # covers the cluster
        elif len(real) <= 1 and len(cl) > 1 and min(
                cl.values()) >= full_cover:
            is_valid = True
        # At most one real, the only rule that fired is COMP_EMAIL_PREFIX
        # or FULL_NAME
        elif len(real) <= 1 and len(cl) == 1 and \
                (cl.get(COMP_EMAIL_PREFIX, 0) or cl.get(FULL_NAME, 0)):
            is_valid = True
        # All with same full name and location / same full name and
        # email domain
        elif cl.get(FULL_NAME, 0) >= full_cover and \
                (cl.get(LOCATION, 0) >= full_cover or
                 cl.get(DOMAIN, 0) >= full_cover):
            is_valid = True
        # All fake and same composite email prefix / same full name
        elif len(real) == 0 and \
                (cl.get(COMP_EMAIL_PREFIX, 0) >= full_cover or
                 cl.get(FULL_NAME, 0) >= full_cover):
            is_valid = True
        else:
            # Split by email address if at least 2 share one
            if cl.get(EMAIL, 0):
                shared_emails = [
                    e for e, c in Counter([m.email for m in members]).items()
                    if c > 1
                ]
                for e in shared_emails:
                    extra_members = [m for m in members if m.email == e]
                    rep = min(extra_members, key=lambda m: m.uid)

                    w_log.writerow([])
                    w_log.writerow([rep.uid, rep.name, rep.email])
                    for a in extra_members:
                        if a.uid != rep.uid:
                            w_log.writerow([a.uid, a.name, a.email])
                            writer.writerow([a.uid, rep.uid])
                            unmask[a.uid] = rep.uid

            # "Maybe" clusters are also written to the alias map; this
            # overrides any per-email mapping recorded just above.
            rep = min(members, key=lambda m: m.uid)
            w_maybe.writerow([])
            w_maybe.writerow([str(cl.items())])
            for m in members:
                if m.uid != rep.uid:
                    unmask[m.uid] = rep.uid
                    writer.writerow([m.uid, rep.uid])
                w_maybe.writerow([m.uid, m.name, m.email])

        if is_valid:
            # The member with the smallest uid represents the group.
            rep = min(members, key=lambda m: m.uid)

            w_log.writerow([])
            w_log.writerow([str(cl.items())])
            w_log.writerow([rep.uid, rep.name, rep.email])
            for a in members:
                if a.uid != rep.uid:
                    w_log.writerow([a.uid, a.name, a.email])
                    writer.writerow([a.uid, rep.uid])
                    unmask[a.uid] = rep.uid

    log.info("Unmasked size: %d", len(unmask))
    # Context manager so the pickle file is flushed and closed promptly.
    with open(os.path.join(out_dir, 'dict', 'aliasMap.dict'), 'wb') as fh:
        pickle.dump(unmask, fh)