def get_continent(name):
    """Return the continent for the country identified by ``name``.

    ``name`` is turned into a country code with ``get_cc``.  Codes present in
    ``continent_map`` are answered from that mapping; every other code is
    handed to ``transformations.cca_to_ctn``.

    Returns ``'Unknown'`` when ``get_cc`` yields ``None``.
    """
    cc = get_cc(name)
    # Identity test: ``== None`` can be answered by a custom ``__eq__``.
    if cc is None:
        return 'Unknown'
    # The local override table wins over the library lookup.
    if cc in continent_map:
        return continent_map[cc]
    return transformations.cca_to_ctn(cc)
Beispiel #2
0
def normalizeTagAn():
    """Normalise per-continent tag counts and write them to a file.

    Reads ``tag_ana_country.txt``; usable lines hold exactly three
    tab-separated fields (country code, tag, count) with a non-empty tag.
    Counts are summed per continent and tag, divided by the continent's
    total, and written to ``continentTags.txt`` as
    ``continent<TAB>tag<TAB>share`` lines.

    Lines whose country code cannot be mapped to a continent or whose count
    is not an integer are skipped.  Continents whose counts sum to zero are
    left out of the output because no share can be computed for them.
    """
    continentTags = defaultdict(lambda: defaultdict(int))
    with open("tag_ana_country.txt", "r") as f:
        lines = f.read().split("\n")

    for line in lines:
        parts = line.split("\t")
        if len(parts) != 3 or parts[1] == "":
            continue
        try:
            continent = transformations.cca_to_ctn(parts[0])
            count = int(parts[2])
        except Exception:
            # Best effort: ignore rows with an unknown country code or a
            # malformed count.  Converting before touching the dict keeps a
            # bad row from leaving an empty zero entry behind.
            continue
        continentTags[continent][parts[1]] += count

    # count total
    totalTagOccurences = {
        continent: sum(tags.values())
        for continent, tags in continentTags.items()
    }

    # normalize and write it to file
    with open("continentTags.txt", 'w') as g:
        for continent, tags in continentTags.items():
            total = totalTagOccurences[continent]
            if not total:
                # Avoid dividing by zero when every count was 0.
                continue
            for tag, count in tags.items():
                share = float(count) / float(total)
                g.write(
                    str(continent) + "\t" + str(tag) + "\t" + str(share) +
                    "\n")
Beispiel #3
0
def find_continent(location):
    """Resolve a location string to a continent name.

    The ``location_continent`` cache is consulted first.  On a miss the
    location is sent to ``geolocalize``, retrying once per second for up to
    30 seconds; the ``short_name`` of the last address component of the first
    result is converted with ``transformations.cca_to_ctn`` and the result is
    stored in the cache.

    Returns:
        ``'NaN'`` when ``location`` is null according to ``pd.notnull``;
        the continent name on success; otherwise the message
        ``"invalid continent for <location>"`` (or ``''`` if the time budget
        ran out before any attempt failed).
    """
    if not pd.notnull(location):
        return 'NaN'

    # Check if the info is already in the location_continent dict
    if location in location_continent:
        continent = location_continent[location]
        print("%s -> %s (from dict)" % (location, continent))
        return continent

    if len(location) > 2:  # Avoid splitting country codes
        # (e.g. BrevigMission => Brevig Mission)
        location = re.sub(r"(\w)([A-Z])", r"\1 \2", location)

    request = ''
    invalid = ''
    timeout = time.time() + 30  # Time budget for the API request (seconds)
    while not request:
        if time.time() > timeout:
            return invalid
        time.sleep(1)
        try:
            # Query the API once and reuse the response; the lookup used to
            # be repeated just to print it.
            response = geolocalize(location.strip())
            request = response[0]['address_components'][-1]['short_name']
            print(response)
            print(dir(request))
            continent = transformations.cca_to_ctn(request)
            print("%s -> %s (from API)" % (location, continent))
            location_continent[location] = continent
            return continent
        except Exception:
            invalid = "invalid continent for %s" % location

    # The API answered but its code could not be mapped to a continent:
    # report that explicitly instead of falling through and returning None.
    return invalid
Beispiel #4
0
    def __init__(self, *args, **kwargs):
        """Build the mapping, then add telize.com-style derived keys.

        After the parent initialiser has run, the original ``country`` value
        is kept under ``country_code`` and ``org`` is copied to ``isp``.
        ``continent`` and ``country`` are then filled from that country code
        via ``transformations``.
        """
        super(IpLocation, self).__init__(*args, **kwargs)

        # Keep the raw code before ``country`` is overwritten further down.
        code = self['country']
        self['country_code'] = code
        self['isp'] = self['org']

        self['continent'] = transformations.cca_to_ctn(code)
        self['country'] = transformations.cca_to_cn(code)
Beispiel #5
0
def get_continent(host_ip):
    """Return the continent name for ``host_ip`` using the GeoIP database.

    The database path is read from the ``GeoIPData`` option of the
    ``Config`` section.  An empty country code is reported as
    ``'North America'`` and the code ``'EU'`` as ``'Europe'``; every other
    code is converted with ``transformations.cca_to_ctn``.
    """
    database = pygeoip.GeoIP(config.get('Config', 'GeoIPData'))
    code = database.country_code_by_addr(host_ip)

    # Codes that are answered directly instead of via the library lookup.
    special_cases = {'': 'North America', 'EU': 'Europe'}
    if code in special_cases:
        return special_cases[code]
    return transformations.cca_to_ctn(code)
Beispiel #6
0
def get_continent(country_short):
    '''
    Input: string
    Output: string

    Using the transformations package this function uses a country
    abbreviation to find the name of the continent the country belongs to.

    When the lookup fails, "EU" is reported as "Europe", "AP" as "Asia" and
    anything else as the string "none".
    '''

    try:
        return transformations.cca_to_ctn(country_short)
    except Exception:
        # ``except Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        if country_short == "EU":
            return "Europe"
        if country_short == "AP":
            return "Asia"
        return "none"
Beispiel #7
0
def get_continent(country_short):
    '''
    Input: string
    Output: string

    Look up the continent name for a country abbreviation with the
    transformations package.

    If that lookup raises, the pseudo-codes "EU" and "AP" are answered with
    "Europe" and "Asia"; every other value yields the string "none".
    '''

    try:
        continent = transformations.cca_to_ctn(country_short)
    except Exception:
        # Narrower than a bare ``except``: interpreter-exit signals such as
        # KeyboardInterrupt still propagate.
        continent = ("Europe" if country_short == "EU" else
                     "Asia" if country_short == "AP" else
                     "none")
    return continent
Beispiel #8
0
def create_continents(app, schema_editor):
    """Assign every ``Country`` to a continent region and save the regions.

    The signature matches a migration callable (``schema_editor`` is not
    used) -- presumably run through Django's ``RunPython``; confirm against
    the migration that references it.

    Each country's code is converted with ``transformations.cca_to_ctn``.
    Codes that lookup rejects with ``KeyError`` are resolved from a small
    fallback table.  The country is then registered through
    ``add_country_to_continent`` and finally every collected region is saved.

    Raises:
        KeyError: if a country code is known neither to ``transformations``
            nor to the fallback table.  Previously such a country was filed
            under the continent left over from the preceding iteration (or
            failed on an unbound name for the first country).
    """
    Country = app.get_model('gmm_region', 'Country')
    Region = app.get_model('gmm_region', 'Region')

    # Codes handled by hand when the library lookup raises KeyError.
    fallback_continents = {
        'SX': 'North America',
        'BQ': 'South America',
        'CW': 'South America',
        'SS': 'Africa',
    }

    continents = {}
    for country in Country.objects.all():
        code = country.country_code
        try:
            continent_name = transformations.cca_to_ctn(code)
        except KeyError:
            # Plain indexing on purpose: an unmapped code must fail loudly
            # rather than reuse a stale continent name.
            continent_name = fallback_continents[code]
        add_country_to_continent(country, continent_name, continents,
                                 Region)

    for continent_region in continents.values():
        continent_region.save()
def get_pingdom_probes(url):
	"""Return the active Pingdom probes listed in the feed at ``url``.

	The feed is parsed with ``feedparser``.  Every item whose
	``pingdom_state`` is "Active" becomes a ``{"ip": ..., "region": ...}``
	dict, where the region is the continent of the probe's country code.
	"""
	feed = feedparser.parse(url)

	active_probes = []
	for entry in feed['items']:
		if entry['pingdom_state'] != "Active":
			continue

		country_code = entry['pingdom_country']['code']
		address = entry['pingdom_ip']

		# Hack because Pingdom uses UK not GB for the country code alpha
		if country_code == "UK":
			region = "Europe"
		else:
			region = transformations.cca_to_ctn(country_code)

		active_probes.append({"ip": address, "region": region})

	return active_probes
Beispiel #10
0
 def _continent(cls, country_code):
     """Return ``(continent_code, continent_name)`` for ``country_code``.

     Both values come from ``transformations``: the code from
     ``cca_to_ctca2`` and the name from ``cca_to_ctn``.
     """
     return (transformations.cca_to_ctca2(country_code),
             transformations.cca_to_ctn(country_code))
Beispiel #11
0
def predictLand(userList,
                cursor,
                X=None,
                y=None,
                mode="grid",
                continentLimit=1000):
    """Train and evaluate classifiers that predict a user's continent.

    When no feature data is supplied (``X`` is ``None`` or an empty list) the
    features are built from ``userList``: one text document per user made of
    the user's tags and games (vectorised with ``CountVectorizer``) combined
    with a sparse matrix of per-game play times.  Labels are the continent
    derived from each user's ``loccountrycode``.  Users located in
    Antarctica, Africa or Oceania are dropped, users whose country code
    cannot be resolved are skipped, and at most ``continentLimit`` users are
    kept per continent.  Intermediate objects are stored with ``saveObject``.

    The features are then reduced to three components with ``TruncatedSVD``
    and handed to the evaluation selected by ``mode``.

    Args:
        userList: sized collection of user records providing ``"steamId"``
            and ``"loccountrycode"``.
        cursor: forwarded to ``readInGameInformation`` -- presumably a
            database cursor; confirm against that helper.
        X: pre-built feature data.  ``None`` or ``[]`` triggers the build
            described above.
        y: labels belonging to ``X``.
        mode: ``"grid"`` (GridSearchCV) or ``"rand"`` (RandomizedSearchCV)
            over an AdaBoost pipeline, ``"tpot"`` for ``clfWithTpot``; any
            other value evaluates a fixed list of classifiers.
        continentLimit: maximum number of users kept per continent.
    """
    # Fresh lists per call.  The former ``X=[]`` / ``y=[]`` defaults were
    # created once and appended to below, so rows leaked from one call into
    # the next.
    if X is None:
        X = []
    if y is None:
        y = []

    if isinstance(X, list) and not X:
        # somehow it seems that some user do not have steamid
        userIdList = [user["steamId"] for user in userList]
        userTagDict, userGameDict, userGameTimeDict, gameNameDict = readInGameInformation(
            userIdList, cursor)

        print("All Game Information from DB collected")

        # try to not have too much of the same continents
        continentCounter = defaultdict(lambda: 0)
        chosenContinents = defaultdict(lambda: 0)
        continentTagDict = defaultdict(lambda: defaultdict(lambda: 0))

        # Continents for codes the library lookup does not accept.
        # NOTE(review): 'BQ' is mapped to 'Africa' here -- looks wrong for
        # Bonaire/Sint Eustatius/Saba; left unchanged, please confirm.
        options = {
            'FX': 'Europe',
            'YU': 'Europe',
            'BQ': 'Africa',
            'SS': 'Africa',
            'ZR': 'Africa',
            'CW': 'South America',
            'SX': 'North America'
        }

        gameCount = len(gameNameDict)
        X_game_times = lil_matrix((len(userList), gameCount))

        # Row index of the next accepted user in X_game_times.
        currUser = 0

        for user in userList:
            if currUser % 300 == 0:
                print(currUser, chosenContinents)
            steamId = str(user["steamId"])

            if steamId not in userTagDict:
                continue

            userTagList = ' '.join(userTagDict[steamId])
            userGameList = ' '.join(userGameDict[steamId])

            try:
                # Maybe invalid countrycode given
                continent = transformations.cca_to_ctn(
                    user["loccountrycode"])
            except Exception:
                try:
                    continent = options[user["loccountrycode"]]
                except KeyError:
                    # Neither the library nor the table knows this code (or
                    # the user has none): skip the user instead of aborting
                    # the whole run.
                    continue

            if continent == 'Antarctica' or continent == 'Africa' or continent == 'Oceania':
                continue

            continentCounter[continent] += 1

            # for graphs
            for tag in userTagDict[steamId]:
                continentTagDict[continent][tag] += 1

            if chosenContinents[continent] < continentLimit:
                chosenContinents[continent] += 1

                # Column index follows the iteration order of gameNameDict.
                for column, game in enumerate(gameNameDict):
                    if game in userGameTimeDict[steamId]:
                        amount = userGameTimeDict[steamId][game]
                        if amount != 0:
                            X_game_times[currUser, column] = amount

                currUser += 1
                # NOTE(review): tags and games are concatenated without a
                # separator, so the last tag and first game form one token --
                # confirm whether that is intended.
                X.append(userTagList + userGameList)
                y.append(continent)

        # Drop the unused rows reserved for users that were filtered out.
        X_reshaped = lil_matrix(X_game_times[:currUser, :])

        saveObject(X, "x_file")
        saveObject(y, "y_file")
        saveObject(X_reshaped, "x_game_times_file")

        print(chosenContinents)
        print("cached x, y and x_game_times")

        count_vect = CountVectorizer()

        X = count_vect.fit_transform(X)
        X_combined = combineLilMats(X, X_reshaped)

        saveObject(X_combined, "X_comb")

        X = X_combined

    print("Chosen mode: " + mode)
    print("X Data in Shape: " + str(X.shape))
    svd = TruncatedSVD(n_components=3)
    X = svd.fit_transform(X)

    print("X Data in Shape after TruncatedSVD: " + str(X.shape))
    if mode == "grid" or mode == "rand":

        clfName = "AB"
        pipe = Pipeline([('scaler', MaxAbsScaler()),
                         ('clf', AdaBoostClassifier())])
        if mode == "grid":
            clf = GridSearchCV(pipe,
                               param_grid=getParams(clfName),
                               verbose=10,
                               n_jobs=2)
        else:
            clf = RandomizedSearchCV(pipe,
                                     param_distributions=getParams(clfName),
                                     n_iter=20,
                                     verbose=10,
                                     n_jobs=2)
        classifyAndPrintResults(clf, clfName, X, y, mode=mode)
    elif mode == "tpot":
        clfWithTpot(X, y)
    else:
        classifiers = [("SVC", SVC()), ("RF", RandomForestClassifier()),
                       ("AB", AdaBoostClassifier()),
                       ("KNN", KNeighborsClassifier())]
        for clfName, clf in classifiers:
            classifyAndPrintResults(clf, clfName, X, y, mode=mode)
Beispiel #12
0
def migrate_repo(repo):
    """Convert one OpenDOAR repository element into a register record.

    ``repo`` is accessed through ``get``, ``find`` and iteration, i.e. an
    ElementTree-style element (presumably one repository node of an OpenDOAR
    export -- confirm against the caller).

    Optional sections (country, classes, languages, content types, policies,
    contacts) are skipped when their element is absent.  Policy terms are
    kept only if ``policy_map`` has a mapping for them; terms listed in
    ``instruction_map`` contribute patches that are applied to the finished
    record.

    Returns:
        A ``(record, [statistics])`` tuple.  ``record`` holds the assembled
        ``register`` and the ``admin.opendoar`` bookkeeping; ``statistics``
        is the item-count entry read from ``rNumOfItems`` and
        ``rDateHarvested``.
    """
    # the various components we need to assemble
    opendoar = {}
    metadata = {}
    organisation = {}
    contacts = []
    apis = []
    statistics = {}
    register = {}
    software = {}
    policies = []

    # a record of the patches to be applied to the data (mostly come from the policy data)
    patches = []

    # original opendoar id
    odid = repo.get("rID")
    if odid is not None:
        opendoar["rid"] = odid

    # repository name
    _extract(repo, "rName", metadata, "name", unescape=True)

    # repository acronym
    _extract(repo, "rAcronym", metadata, "acronym", unescape=True)

    # repository url
    _extract(repo, "rUrl", metadata, "url")

    # oai base url
    oai = {"api_type" : "oai-pmh"}
    _extract(repo, "rOaiBaseUrl", oai, "base_url")
    if "base_url" in oai:
        apis.append(oai)

    # organisational details
    _extract(repo, "uName", organisation, "unit", unescape=True)
    _extract(repo, "uAcronym", organisation, "unit_acronym", unescape=True)
    _extract(repo, "uUrl", organisation, "unit_url")
    _extract(repo, "oName", organisation, "name", unescape=True)
    _extract(repo, "oAcronym", organisation, "acronym", unescape=True)
    _extract(repo, "oUrl", organisation, "url")
    _extract(repo, "paLatitude", organisation, "lat", cast=float)
    _extract(repo, "paLongitude", organisation, "lon", cast=float)

    # country (guarded like the other optional sections)
    cel = repo.find("country")
    if cel is not None:
        _extract(cel, "cIsoCode", metadata, "country_code", lower=True)
        _extract(cel, "cIsoCode", organisation, "country_code", lower=True)

        isocode = cel.find("cIsoCode")
        code = isocode.text if isocode is not None else None
        if code is not None and code != "":
            try:
                # specify the continent in the metadata
                continent_code = transformations.cca_to_ctca2(code)
                metadata["continent_code"] = continent_code.lower()
                continent = transformations.cca_to_ctn(code)
                metadata["continent"] = continent

                # normalised country name
                country = pycountry.countries.get(alpha2=code.upper()).name
                metadata["country"] = country
                organisation["country"] = country
            except KeyError:
                # unknown country code: keep whatever was resolved so far
                pass

    # repository description
    _extract(repo, "rDescription", metadata, "description", unescape=True)

    # remarks
    _extract(repo, "rRemarks", metadata, "description", unescape=True, append=True, prepend="  ")

    # statistics
    _extract(repo, "rNumOfItems", statistics, "value", cast=int)
    _extract(repo, "rDateHarvested", statistics, "date")

    # established date
    _extract(repo, "rYearEstablished", metadata, "established_date")

    # repository type
    _extract(repo, "repositoryType", metadata, "repository_type", aslist=True)

    # operational status
    _extract(repo, "operationalStatus", register, "operational_status")

    # software
    _extract(repo, "rSoftWareName", software, "name", unescape=True)
    _extract(repo, "rSoftWareVersion", software, "version")

    # subject classifications
    classes = repo.find("classes")
    if classes is not None:
        metadata["subject"] = []
        for c in classes:
            subject = {}
            _extract(c, "clCode", subject, "code")
            _extract(c, "clTitle", subject, "term", unescape=True)
            metadata["subject"].append(subject.get("term")) # FIXME: a bit of a round trip here, but will suffice

    # languages
    langs = repo.find("languages")
    if langs is not None:
        metadata["language_code"] = []
        metadata["language"] = []
        for lang_el in langs:
            code_el = lang_el.find("lIsoCode")
            # ``text`` is None for an empty element, so test truthiness
            # instead of comparing with "" only.
            if code_el is None or not code_el.text:
                continue
            lc = code_el.text.lower()
            try:
                lang = pycountry.languages.get(alpha2=lc).name
            except KeyError:
                # Same policy as the country lookup in this function:
                # an unknown code is skipped rather than aborting the record.
                continue
            metadata["language_code"].append(lc)
            metadata["language"].append(lang)

    # content types
    ctel = repo.find("contentTypes")
    if ctel is not None:
        metadata["content_type"] = []
        for ct in ctel:
            metadata["content_type"].append(ct.text)

    # policies
    polel = repo.find("policies")
    if polel is not None:
        for p in polel:
            policy = {}
            _extract(p, "policyType", policy, "policy_type")
            posel = p.find("poStandard")
            if posel is not None:
                policy["terms"] = []
                for item in posel:
                    # an empty element has ``text`` None
                    t = (item.text or "").strip()

                    # only keep terms which have mappings in the policy map
                    mapped = policy_map.get(t)
                    if mapped is not None:
                        policy["terms"].append(mapped)

                    # look for any special instructions on the term
                    patch = instruction_map.get(t)
                    if patch is not None:
                        patches.append(patch)

            if len(policy.get("terms", [])) > 0:
                policies.append(policy)

    # contacts
    conel = repo.find("contacts")
    if conel is not None:
        for contact in conel:
            cont_details = {}
            _extract(contact, "pName", cont_details, "name", unescape=True)
            _extract(contact, "pJobTitle", cont_details, "job_title", unescape=True)
            _extract(contact, "pEmail", cont_details, "email")
            _extract(contact, "pPhone", cont_details, "phone")

            phone_el = contact.find("pPhone")
            has_phone = phone_el is not None and phone_el.text is not None

            # add the top level repo data about address and phone
            _extract(repo, "postalAddress", cont_details, "address", unescape=True)
            if not has_phone:
                _extract(repo, "paPhone", cont_details, "phone")
            _extract(repo, "paFax", cont_details, "fax")

            # we also add the top level stuff about lat/lon
            if organisation.get("lat") is not None:
                cont_details["lat"] = organisation.get("lat")
            if organisation.get("lon") is not None:
                cont_details["lon"] = organisation.get("lon")

            # record the job title as the contact role for the time being
            full_record = {"details" : cont_details}
            _extract(contact, "pJobTitle", full_record, "role", unescape=True, aslist=True)

            contacts.append(full_record)

    # now assemble the object
    register["metadata"] = [
        {
            "lang" : "en",
            "default" : True,
            "record" : metadata
        }
    ]

    if len(software.keys()) > 0:
        register["software"] = [software]
    if len(contacts) > 0:
        register["contact"] = contacts
    if len(organisation.keys()) > 0:
        register["organisation"] = [{"details" : organisation, "role" : ["host"]}] # add a default role
    if len(policies) > 0:
        register["policy"] = policies
    if len(apis) > 0:
        register["api"] = apis

    # final few opendoar admin values
    opendoar["in_opendoar"] = True
    # The trailing "Z" declares UTC, so the timestamp must be taken in UTC
    # rather than local time.
    opendoar["last_saved"] = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")

    record = {
        "register" : register,
        "admin" : {
            "opendoar" : opendoar
        }
    }

    statistics["third_party"] = "opendoar"
    statistics["type"] = "item_count"

    # apply any additional field patches
    # each patch is "field.path:value" segments joined by "||"
    for patch in patches:
        segments = patch.split("||")
        for s in segments:
            parts = s.split(":", 1)
            field = parts[0]
            try:
                value = json.loads(parts[1])
            except ValueError:
                # not valid JSON: use the raw text as the value
                value = parts[1]
            stack = field.split(".")
            _apply(record, stack, 0, value)

    return record, [statistics]
Beispiel #13
0
def migrate_repo(repo):
    """Convert one OpenDOAR repository element into a register record.

    ``repo`` is accessed through ``get``, ``find`` and iteration, i.e. an
    ElementTree-style element (presumably one repository node of an OpenDOAR
    export -- confirm against the caller).

    Optional sections (country, classes, languages, content types, policies,
    contacts) are skipped when their element is absent.  Software, contact,
    organisation, policy and api entries are always written to the register,
    even when empty.

    Returns:
        A ``(record, [statistics])`` tuple.  ``record`` holds the assembled
        ``register`` and the ``admin.opendoar`` bookkeeping; ``statistics``
        is the item-count entry read from ``rNumOfItems`` and
        ``rDateHarvested``.
    """
    # the various components we need to assemble
    opendoar = {}
    metadata = {}
    organisation = {}
    contacts = []
    apis = []
    statistics = {}
    register = {}
    software = {}
    policies = []

    # original opendoar id
    odid = repo.get("rID")
    if odid is not None:
        opendoar["rid"] = odid

    # repository name
    _extract(repo, "rName", metadata, "name", unescape=True)

    # repository acronym
    _extract(repo, "rAcronym", metadata, "acronym", unescape=True)

    # repository url
    _extract(repo, "rUrl", metadata, "url")

    # oai base url
    oai = {"api_type" : "oai-pmh"}
    _extract(repo, "rOaiBaseUrl", oai, "base_url")
    if "base_url" in oai:
        apis.append(oai)

    # organisational details
    _extract(repo, "uName", organisation, "unit", unescape=True)
    _extract(repo, "uAcronym", organisation, "unit_acronym", unescape=True)
    _extract(repo, "uUrl", organisation, "unit_url")
    _extract(repo, "oName", organisation, "name", unescape=True)
    _extract(repo, "oAcronym", organisation, "acronym", unescape=True)
    _extract(repo, "oUrl", organisation, "url")
    _extract(repo, "paLatitude", organisation, "lat", cast=float)
    _extract(repo, "paLongitude", organisation, "lon", cast=float)

    # country (guarded like the other optional sections)
    cel = repo.find("country")
    if cel is not None:
        _extract(cel, "cIsoCode", metadata, "country_code", lower=True)
        _extract(cel, "cIsoCode", organisation, "country_code", lower=True)

        isocode = cel.find("cIsoCode")
        code = isocode.text if isocode is not None else None
        if code is not None and code != "":
            try:
                # specify the continent in the metadata
                continent_code = transformations.cca_to_ctca2(code)
                metadata["continent_code"] = continent_code.lower()
                continent = transformations.cca_to_ctn(code)
                metadata["continent"] = continent

                # normalised country name
                country = pycountry.countries.get(alpha2=code.upper()).name
                metadata["country"] = country
                organisation["country"] = country
            except KeyError:
                # unknown country code: keep whatever was resolved so far
                pass

    # repository description
    _extract(repo, "rDescription", metadata, "description", unescape=True)

    # remarks
    _extract(repo, "rRemarks", metadata, "description", unescape=True, append=True, prepend="  ")

    # statistics
    _extract(repo, "rNumOfItems", statistics, "value", cast=int)
    _extract(repo, "rDateHarvested", statistics, "date")

    # established date
    _extract(repo, "rYearEstablished", metadata, "established_date")

    # repository type
    _extract(repo, "repositoryType", metadata, "repository_type", aslist=True)

    # operational status
    _extract(repo, "operationalStatus", register, "operational_status")

    # software
    _extract(repo, "rSoftWareName", software, "name", unescape=True)
    _extract(repo, "rSoftWareVersion", software, "version")

    # subject classifications
    classes = repo.find("classes")
    if classes is not None:
        metadata["subject"] = []
        for c in classes:
            subject = {}
            _extract(c, "clCode", subject, "code")
            _extract(c, "clTitle", subject, "term", unescape=True)
            metadata["subject"].append(subject)

    # languages
    langs = repo.find("languages")
    if langs is not None:
        metadata["language_code"] = []
        metadata["language"] = []
        for lang_el in langs:
            code_el = lang_el.find("lIsoCode")
            # ``text`` is None for an empty element, so test truthiness
            # instead of comparing with "" only.
            if code_el is None or not code_el.text:
                continue
            lc = code_el.text.lower()
            try:
                lang = pycountry.languages.get(alpha2=lc).name
            except KeyError:
                # Same policy as the country lookup in this function:
                # an unknown code is skipped rather than aborting the record.
                continue
            metadata["language_code"].append(lc)
            metadata["language"].append(lang)

    # content types
    ctel = repo.find("contentTypes")
    if ctel is not None:
        metadata["content_type"] = []
        for ct in ctel:
            metadata["content_type"].append(ct.text)

    # policies
    polel = repo.find("policies")
    if polel is not None:
        for p in polel:
            policy = {}
            _extract(p, "policyType", policy, "policy_type")
            _extract(p, "policyGrade", policy, "policy_grade")
            posel = p.find("poStandard")
            if posel is not None:
                policy["terms"] = []
                for item in posel:
                    policy["terms"].append(item.text)
            policies.append(policy)

    # contacts
    conel = repo.find("contacts")
    if conel is not None:
        for contact in conel:
            cont_details = {}
            _extract(contact, "pName", cont_details, "name", unescape=True)
            _extract(contact, "pJobTitle", cont_details, "job_title", unescape=True)
            _extract(contact, "pEmail", cont_details, "email")
            _extract(contact, "pPhone", cont_details, "phone")

            phone_el = contact.find("pPhone")
            has_phone = phone_el is not None and phone_el.text is not None

            # add the top level repo data about address and phone
            _extract(repo, "postalAddress", cont_details, "address", unescape=True)
            if not has_phone:
                _extract(repo, "paPhone", cont_details, "phone")
            _extract(repo, "paFax", cont_details, "fax")

            # we also add the top level stuff about lat/lon
            if organisation.get("lat") is not None:
                cont_details["lat"] = organisation.get("lat")
            if organisation.get("lon") is not None:
                cont_details["lon"] = organisation.get("lon")

            # record the job title as the contact role for the time being
            full_record = {"details" : cont_details}
            _extract(contact, "pJobTitle", full_record, "role", unescape=True, aslist=True)

            contacts.append(full_record)

    # now assemble the object
    register["metadata"] = [
        {
            "lang" : "en",
            "default" : True,
            "record" : metadata
        }
    ]
    register["software"] = [software]
    register["contact"] = contacts
    register["organisation"] = [{"details" : organisation, "role" : ["host"]}] # add a default role
    register["policy"] = policies
    register["api"] = apis

    opendoar["in_opendoar"] = True

    record = {
        "register" : register,
        "admin" : {
            "opendoar" : opendoar
        }
    }

    statistics["third_party"] = "opendoar"
    statistics["type"] = "item_count"

    return record, [statistics]
Beispiel #14
0
 def detect(self, register, info):
     """Set the register's continent from its country code and log it.

     The continent code (``cca_to_ctca2``) and continent name
     (``cca_to_ctn``) are both looked up from ``register.country_code`` and
     stored with ``register.set_continent``.  ``info`` is not used here.
     """
     country = register.country_code
     ct_code = transformations.cca_to_ctca2(country)
     ct_name = transformations.cca_to_ctn(country)
     register.set_continent(name=ct_name, code=ct_code)
     message = "Determined continent from country: " + country + " -> " + ct_name
     log.info(message)