Beispiel #1
0
def determineGenders(twitterUserData):
	"""Guess a gender for every user in *twitterUserData*.

	Each value must be a dict with at least 'name' and 'screen_name'.
	A 'gender' key is added in place (users whose name is '.' are
	skipped), the mapping is dumped to backup2.json, and the annotated
	mapping is returned.
	"""
	detector = GenderDetector('us')
	for screen_name, record in twitterUserData.items():
		print("Getting gender for: {0} ".format(screen_name))
		full_name = record['name']
		if full_name == '.':
			continue  # placeholder name, nothing to guess
		if " " in full_name:
			record['gender'] = detector.guess(full_name.split(" ")[0])
		else:
			print(full_name)
			record['gender'] = detector.guess(full_name)

	print("Creating backup file:")
	with open('backup2.json', 'w') as backup:
		json.dump(twitterUserData, backup)

	print("Results")
	for user_id in twitterUserData:
		print(user_id)
		print("\tid: {0}".format(user_id))
		print("\tscreenName: {0}".format(twitterUserData[user_id]['screen_name']))
		print("\tgender: {0}".format(twitterUserData[user_id]['gender']))
	return twitterUserData
Beispiel #2
0
def user(request, user_id):
    """Render the detail page for one recommended user.

    Looks up the recommendation, guesses a gender from the first token of
    the username ('unknown' on failure), and rebuilds every shelf that
    also appears in the user's most common shelves with best=True.
    """
    connect()
    record = dict(models.Recommendation.get(user_gid=user_id))
    first_token = record['username'].replace('[', '').split(" ")[0]
    try:
        record['gender'] = GenderDetector('us').guess(first_token)
    except Exception:
        record['gender'] = "unknown"
    for book_idx, book in enumerate(record['books_details_recommended']):
        for shelf_idx, shelf in enumerate(book['list_shelves']):
            for best in record['most_common_shelves']:
                if best.shelve != shelf.shelve:
                    continue
                current = record['books_details_recommended'][book_idx][
                    'list_shelves'][shelf_idx]
                record['books_details_recommended'][book_idx]['list_shelves'][
                    shelf_idx] = models.shelve(count=current.count,
                                               votes=current.votes,
                                               gid=current.gid,
                                               best=True,
                                               shelve=current.shelve)
    return render(request, 'recom/user.html', {'user': record})
def find_out_genders(twitterUserData):
    """Figure out genders of users based on each user's first name.

    Adds a 'gender' key to every user dict (skipping names equal to '.'),
    writes the whole mapping to backup2.json, prints a summary, and
    returns the mapping.
    """
    detector = GenderDetector('us')
    for screenName in twitterUserData:
        print("Getting gender for: {0} ".format(screenName))
        full_name = twitterUserData[screenName]['name']
        if full_name == '.':
            continue  # placeholder name — leave the record untouched
        first = full_name.split(" ")[0] if " " in full_name else None
        if first is None:
            print(full_name)
            twitterUserData[screenName]['gender'] = detector.guess(full_name)
        else:
            twitterUserData[screenName]['gender'] = detector.guess(first)
    print("Creating backup file:")
    with open('backup2.json', 'w') as f:
        json.dump(twitterUserData, f)
    print("Results")
    for key in twitterUserData:
        print(key)
        print("\tid: {0}".format(key))
        print("\tscreenName: {0}".format(twitterUserData[key]['screen_name']))
        print("\tgender: {0}".format(twitterUserData[key]['gender']))
    return twitterUserData
Beispiel #4
0
    def __init__(self):
        """Open the log file, connect to Postgres, start the worker-profile
        query and build one gender detector per library/country.

        NOTE(review): relies on module-level imports of psycopg2, gender,
        GenderComputer, GenderDetector and gender_guesser — confirm.
        """
        # Settings
        self.all_data_file_name = './csv_files/altgender4_2017_12_12_upwork_analysis_unitedstates_allskills.csv'  # Filename for all data
        self.data_log_file_name = './log_files/alt_gender4_log_upwork_data_analysis_2017_12_12_unitedstates_allskills.txt'

        # Write a log (append mode: runs accumulate in the same file)
        self.log = open(self.data_log_file_name, 'a')
        self.log.write("We have started analyzing data!" + "\n")
        self.log.flush()

        # Connect to the database
        self.conn = psycopg2.connect("dbname=eureka01")
        self.cur = self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
        psycopg2.extensions.register_adapter(dict, psycopg2.extras.Json)

        # Get detailed_info from workers in our database
        self.cur.execute(
            "SELECT detailed_info FROM upwork_unitedstates_allskills_2017_12_12;"
        )

        # Initialize arrays for Causal Analysis
        self.user_count = 1

        # Initialize gender detectors (sex-machine, GenderComputer, four
        # country-specific GenderDetectors, gender-guesser)
        self.d = gender.Detector()
        self.gc = GenderComputer('./nameLists')
        self.us_detector = GenderDetector('us')
        self.ar_detector = GenderDetector('ar')
        self.uk_detector = GenderDetector('uk')
        self.uy_detector = GenderDetector('uy')
        self.gender_guesser = gender_guesser.Detector()
Beispiel #5
0
 def test_format_name(self):
     """All case/whitespace variants of 'marcos' should normalize to 'Marcos'."""
     detector = GenderDetector()
     for name in ['mARCOS', ' Marcos ', 'Marcos    ', 'MARCOS']:
         self.assertEqual(
             detector._format_name(name),
             'Marcos'
         )
def predict_gender(name_phrase):
    """Guess a gender for a free-text name phrase.

    Returns 'male' only when every recognized token is male, 'female' when
    any token is female, and '' when nothing can be determined.
    """
    detector = GenderDetector(country='us', unknown_value='')
    tokens = _strip_punctuation(name_phrase).split(' ')
    guesses = []
    for token in tokens:
        guess = detector.guess(token)
        if guess:  # drop tokens the detector could not classify
            guesses.append(guess)

    if guesses and all(g == 'male' for g in guesses):
        return 'male'
    if 'female' in guesses:
        return 'female'
    return ''
Beispiel #7
0
def gender_identify(name):
    """Extract a name via the module-level ``pattern`` regex and guess its gender.

    Returns the UK detector's guess for the first word of the first match,
    or None when nothing matches or the detector cannot handle the name.
    """
    name = re.findall(pattern, name, re.M)
    detector = GenderDetector('uk')
    try:
        # split(" ")[0] already handles names without a space (it returns
        # the whole string), so the old space/no-space branch was redundant.
        return detector.guess(name[0].split(" ")[0])
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # are no longer swallowed; IndexError (no match) still returns None.
        return None
def test_calculate_all_spec_props():
    """
    Test the calculate_all_spec_props function.

    Feeds a precomputed spectrum (y), frequency bins (frq), cepstrum
    (ceps) and (energy, frequency) pairs, and checks the returned spectral
    properties against golden values.
    """
    expected = [
        8.26875000e+03, 6.16316236e+03, 8.26875000e+03, 4.13437500e+03,
        1.24031250e+04, 8.26875000e+03, 0.00000000e+00, -1.36000000e+00,
        1.01140426e+00, 1.18447081e-04, 1.00145455e+03, 0.00000000e+00,
        1.65375000e+04, 1.65375000e+04, 3.33333333e-01, 1.92518695e+00,
        6.11016956e+00, -1.28009780e-01, 1.00145455e+03, 5.46327273e+02,
        9.47490909e+02, 8.76072727e+02, 9.77745455e+02, 9.32945455e+02,
        9.90400000e+02, 1.01032727e+03, 1.01789091e+03, 1.04974545e+03
    ]

    # Spectrum magnitudes for a 4-bin FFT of a tiny sample signal.
    y = np.array([
        10793861000.0, 18623418.961417913, 71.06335201775947, 735.660880806231
    ])
    frq = np.array([0., 5512.5, 11025., 16537.5])
    ceps = np.array([6.11016956, 0.85929401, -0.12800978, 0.85929401])
    # (energy, frequency) candidates — presumably sorted by energy; confirm.
    esp_frecuencia_pairs = [(1433045920722.5535, 1001.4545454545454),
                            (1432482084982.5637, 546.32727272727266),
                            (1429332907671.3792, 947.4909090909091),
                            (1424138212407.696, 876.07272727272721),
                            (1423571652995.4846, 977.74545454545455),
                            (1420207470649.9392, 932.94545454545448),
                            (1417896507230.5828, 990.39999999999998),
                            (1413376292432.0315, 1010.3272727272727),
                            (1408203258629.1887, 1017.8909090909091),
                            (1402265894500.063, 1049.7454545454545)]
    result = GenderDetector().calculate_all_spec_props(frq, y, ceps,
                                                       esp_frecuencia_pairs)
    assert np.allclose(expected, result)
def index(request):
    """List all evaluated recommendations, best precision first, with a
    guessed gender ('unknown' when the detector fails) for each user."""
    connect()
    cursor = connection.cursor()
    rows = cursor.execute("SELECT id, user_gid, precision, recall, username, gender FROM prs.recommendation where common_shelves_retrieved=true")
    users = sorted(rows, key=lambda row: row['precision'], reverse=True)
    detector = GenderDetector('us')
    for position, entry in enumerate(users):
        first_name = entry['username'].replace('[', '').split(" ")[0]
        try:
            guessed = detector.guess(first_name)
        except Exception:
            guessed = "unknown"

        users[position] = dict(
            models.Recommendation(user_gid=entry['user_gid'],
                                  gender=guessed,
                                  precision=entry['precision'],
                                  recall=entry['recall'],
                                  username=entry['username']))
    return render(request, 'recom/index.html', {'users': users})
def user(request, user_id):
    """Render one recommendation's detail page, flagging common shelves.

    Shelves that also appear in the user's most common shelves are
    replaced in place with a best=True copy.
    """
    connect()
    profile = dict(models.Recommendation.get(user_gid=user_id))
    tokens = profile['username'].replace('[', '').split(" ")
    try:
        profile['gender'] = GenderDetector('us').guess(tokens[0])
    except Exception:
        profile['gender'] = "unknown"
    books = profile['books_details_recommended']  # alias: mutated in place
    for book_pos, book in enumerate(books):
        for shelf_pos, shelf in enumerate(book['list_shelves']):
            for common in profile['most_common_shelves']:
                if common.shelve != shelf.shelve:
                    continue
                old = books[book_pos]['list_shelves'][shelf_pos]
                books[book_pos]['list_shelves'][shelf_pos] = models.shelve(
                    count=old.count, votes=old.votes, gid=old.gid,
                    best=True, shelve=old.shelve)
    return render(request, 'recom/user.html', {'user': profile})
def test_hps():
    """hps() should match precomputed values for a small sample signal."""
    samples = [255, 255, 200, 300, 500, 100, 600, 0]
    expected = [
        16581375.0, 15300000.0, 60000000.0, 180000.0, 500.0, 100.0, 600.0, 0.0
    ]
    raw = GenderDetector().hps(samples)
    # Coerce to plain floats before comparing against the golden list.
    assert np.allclose(expected, list(map(float, raw)))
def test_calculate_modulation_index():
    """calculate_modulation_index() should match its precomputed value."""
    signal = np.array([255, 255, 200, 300, 500, 100, 600, 0])
    # Arguments are (arr, mindom, maxdom, dfrange).
    outcome = GenderDetector().calculate_modulation_index(
        signal, 11025., 5512.5, 1000.0)
    assert np.allclose(0.265, outcome)
Beispiel #13
0
def upload_file():
    """Handle the upload form: run gender detection on a POSTed WAV file.

    GET renders the upload page; POST with a valid .wav returns the
    detector's result as JSON, anything else redirects back to the form.
    """
    if request.method == 'POST':
        # .get avoids a BadRequestKeyError (HTTP 400) when the form had no
        # 'file' part at all — treat that like an empty submission.
        file_wav = request.files.get('file')
        # if the user does not select a file, the browser may still submit
        # an empty part without a filename
        if file_wav is None or file_wav.filename == '':
            return redirect(request.url)
        if file_wav and allowed_file(file_wav.filename):
            fs, signal = scipy.io.wavfile.read(file_wav)
            result = GenderDetector().process(fs, signal)
            return jsonify(result)

    return render_template('main.html')
Beispiel #14
0
def index(request):
    """Overview page: every analysed recommendation, sorted by precision
    (descending), each rebuilt with a freshly guessed gender."""
    connect()
    cursor = connection.cursor()
    rows = cursor.execute(
        "SELECT id, user_gid, precision, recall, username, gender FROM prs.recommendation where common_shelves_retrieved=true"
    )
    ranked = sorted(rows, key=lambda r: r['precision'], reverse=True)
    detector = GenderDetector('us')
    rebuilt = []
    for row in ranked:
        first = row['username'].replace('[', '').split(" ")[0]
        try:
            sex = detector.guess(first)
        except Exception:
            sex = "unknown"

        rebuilt.append(
            dict(
                models.Recommendation(user_gid=row['user_gid'],
                                      gender=sex,
                                      precision=row['precision'],
                                      recall=row['recall'],
                                      username=row['username'])))
    return render(request, 'recom/index.html', {'users': rebuilt})
def test_preprocess():
    """
    Test the preprocess function.

    Reads the 1F.wav fixture and checks the extracted feature vector
    against golden values.
    """
    inputFileName = './tests/wav/1F.wav'
    fs, signal = scipy.io.wavfile.read(inputFileName)
    expected = [
        5.87534884e+02, 2.95871868e+02, 5.87534884e+02, 3.31348837e+02,
        8.43720930e+02, 5.12372093e+02, -1.17632943e-15, -1.20000008e+00,
        8.47444655e+00, 2.75899724e-02, 2.07813953e+02, 7.51627907e+01,
        1.09990698e+03, 1.02474419e+03, 1.81554103e-04, 7.07795704e-04,
        1.01110553e+01, -2.22503906e-01, 2.07813953e+02, 2.12465116e+02,
        2.23627907e+02, 2.37023256e+02, 2.35906977e+02, 2.34418605e+02,
        2.06139535e+02, 2.36093023e+02, 2.36465116e+02, 2.23813953e+02
    ]
    result = GenderDetector().preprocess(fs, signal)
    assert np.allclose(expected, result)
def test_calculate_spectrum_cepstrum_mono():
    """
    Test the calculate_spectrum_cepstrum function
    when the signal is mono.

    Feeds a tiny 8-sample signal at 44.1 kHz and checks the spectrum,
    frequency bins and cepstrum against golden values.
    """
    arr = [255, 255, 200, 300, 500, 100, 600, 0]
    arr = np.array(arr)
    fs = 44100  # sample rate in Hz

    expected_y = np.array([
        10793861000.0, 18623418.961417913, 71.06335201775947, 735.660880806231
    ])
    expected_freq = np.array([0., 5512.5, 11025., 16537.5])
    expected_ceps = np.array([6.11016956, 0.85929401, -0.12800978, 0.85929401])

    y, freq, ceps = GenderDetector().calculate_spectrum_cepstrum(arr, fs)
    assert np.allclose(expected_y, y) and np.allclose(
        expected_freq, freq) and np.allclose(expected_ceps, ceps)
def test_process():
    """
    Test the process function.

    Runs the full pipeline on the 1F.wav fixture and expects a FEMALE
    verdict with the matching display name.
    """
    response = {
        "data": {
            "gender": {
                "id": "FEMALE",
                "name": "The gender of this person is : FEMALE"
            }
        }
    }

    inputFileName = './tests/wav/1F.wav'
    fs, signal = scipy.io.wavfile.read(inputFileName)
    result = GenderDetector().process(fs, signal)
    assert (response["data"]["gender"]["id"] == result["data"]["gender"]["id"]
            and response["data"]["gender"]["name"]
            == result["data"]["gender"]["name"])
def test_get_n_fundamental_frequencies():
    """
    Test the get_n_fundamental_frequencies function.

    Given (energy, frequency) pairs, the first n frequencies (the second
    tuple element) should be returned in their original order.
    """
    arr = [(1433045920722.5535, 1001.4545454545454),
           (1432482084982.5637, 546.32727272727266),
           (1429332907671.3792, 947.4909090909091),
           (1424138212407.696, 876.07272727272721),
           (1423571652995.4846, 977.74545454545455),
           (1420207470649.9392, 932.94545454545448),
           (1417896507230.5828, 990.39999999999998),
           (1413376292432.0315, 1010.3272727272727),
           (1408203258629.1887, 1017.8909090909091),
           (1402265894500.063, 1049.7454545454545)]
    n = 3
    expected = np.array(
        [1001.4545454545454, 546.32727272727266, 947.4909090909091])
    result = GenderDetector().get_n_fundamental_frequencies(n, arr)
    assert np.allclose(expected, result)
Beispiel #19
0
from gender_detector import GenderDetector
detector = GenderDetector('us')  # It can also be ar, uk, uy.

count = 0

# Guess gender from each first name in the CSV (works best for English
# names) and count how many come back 'unknown'.
# NOTE(review): Python 2 script (print statements); each line still
# carries its trailing newline when passed to guess() — confirm intended.
with open('/Users/ankitkumar/Downloads/firstname.csv') as f:
    for line in f:
        try:
            if detector.guess(line) == 'unknown':
                print count
                count = count + 1
        except (KeyError, NameError):
            print "skip"

print count
Beispiel #20
0
class UpworkDataFormatter:
    """Read Upwork worker profiles from Postgres and write a CSV comparing
    five gender-inference strategies per worker.

    NOTE(review): Python 2 code (print statements, unicode()) — run under
    Python 2 or port before reuse.
    """

    def __init__(self):
        """Open the log, connect to Postgres, start the profile query and
        construct one detector per strategy/country."""
        # Settings
        self.all_data_file_name = './csv_files/altgender4_2017_12_12_upwork_analysis_unitedstates_allskills.csv'  # Filename for all data
        self.data_log_file_name = './log_files/alt_gender4_log_upwork_data_analysis_2017_12_12_unitedstates_allskills.txt'

        # Write a log
        self.log = open(self.data_log_file_name, 'a')
        self.log.write("We have started analyzing data!" + "\n")
        self.log.flush()

        # Connect to the database
        self.conn = psycopg2.connect("dbname=eureka01")
        self.cur = self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
        psycopg2.extensions.register_adapter(dict, psycopg2.extras.Json)

        # Get detailed_info from workers in our database
        self.cur.execute(
            "SELECT detailed_info FROM upwork_unitedstates_allskills_2017_12_12;"
        )

        # Initialize arrays for Causal Analysis
        self.user_count = 1

        # Initialize gender detectors
        self.d = gender.Detector()
        self.gc = GenderComputer('./nameLists')
        self.us_detector = GenderDetector('us')
        self.ar_detector = GenderDetector('ar')
        self.uk_detector = GenderDetector('uk')
        self.uy_detector = GenderDetector('uy')
        self.gender_guesser = gender_guesser.Detector()

    def save_all_to_csv(self):
        """Write one CSV row per fetched worker with every detector's
        verdict; workers that raise are written as all-"error" rows."""
        with open(self.all_data_file_name, 'w') as csvfile:
            fieldnames = [
                'user_count', 'worker_id', 'first_name', 'profile_desc',
                'gender_guesser', 'gender_detector', 'sex_machine',
                'gender_computer', 'gender_pronoun'
            ]
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()

            for user in self.cur:
                try:
                    user_count = self.user_count  # NOTE(review): unused local
                    worker_id = user[0]["ciphertext"]
                    first_name = user[0]["dev_first_name"].encode(
                        'utf-8').strip()
                    print "Successfully done first_name"
                    profile_desc = user[0]['dev_blurb'].encode('utf-8').strip()
                    gender_guesser = self.gender_by_guesser(
                        first_name)  # Output of gender guesser
                    gender_detector = self.gender_by_detector(
                        first_name)  # Output of gender detector
                    sex_machine = self.gender_by_sex_machine(
                        first_name)  # Output of sex machine
                    gender_computer = self.gender_by_computer(
                        first_name)  # Output of gender computer
                    gender_pronoun = self.gender_by_pronoun(
                        user)  # Output of pronoun in reviews

                    writer.writerow({
                        'user_count': self.user_count,
                        'worker_id': worker_id,
                        'first_name': first_name,
                        'profile_desc': profile_desc,
                        'gender_guesser': gender_guesser,
                        'gender_detector': gender_detector,
                        'sex_machine': sex_machine,
                        'gender_computer': gender_computer,
                        'gender_pronoun': gender_pronoun
                    })

                except KeyboardInterrupt:
                    print "We got interrupted"
                    break

                except Exception as error:
                    print "Ran into some error at user {0}".format(
                        self.user_count)
                    print error

                    writer.writerow({
                        'user_count': self.user_count,
                        'worker_id': "error",
                        'first_name': "error",
                        'profile_desc': "error",
                        'gender_guesser': "error",
                        'gender_detector': "error",
                        'sex_machine': "error",
                        'gender_computer': "error",
                        'gender_pronoun': "error"
                    })

                print "Finished writing data for {0}".format(self.user_count)
                self.user_count += 1

    def gender_by_detector(self, name):
        """Guess via gender_detector's US list; returns None on failure."""
        try:
            us_gender = self.us_detector.guess(
                name)  #Check against US database
            return us_gender
        except Exception as error:
            print "Something wrong at gender_by_detector: {0}".format(error)

    def gender_by_computer(self, name):
        """Guess via GenderComputer with USA rules; None on failure."""
        try:
            unicode_name = unicode(name, "utf-8")
            gender = self.gc.resolveGender(unicode_name, 'USA')
            return gender
        except Exception as error:
            print "Something wrong at gender_by_computer: {0}".format(error)

    def gender_by_sex_machine(self, name):
        """Guess via the sex-machine detector; None on failure."""
        try:
            unicode_name = unicode(name, "utf-8")
            gender = self.d.get_gender(unicode_name)
            return gender
        except Exception as error:
            print "Something wrong at gender_by_sex_machine: {0}".format(error)

    def gender_by_guesser(self, name):
        """Guess via gender-guesser; None on failure."""
        try:
            unicode_name = unicode(name, "utf-8")
            gender = self.gender_guesser.get_gender(unicode_name)
            return gender
        except Exception as error:
            print "Something wrong at gender_by_guesser: {0}".format(error)

    def gender_by_pronoun(self, user):
        """Scan client feedback comments for gendered pronouns.

        Checks fixed-price ("fp") assignments first, then hourly ("hr")
        ones; returns 'female', 'male', 'ambiguous' (both matched) or
        'unknown'.
        """
        is_female = False
        is_male = False

        try:
            all_assignments = user[0]["assignments"]
            all_fp_assignments = all_assignments["fp"]["job"]
            all_hr_assignments = all_assignments["hr"]["job"]

            if (type(all_fp_assignments) == list):
                for job in all_fp_assignments:
                    try:
                        try:
                            feedback = job["feedback"]["comment"]
                            print feedback
                        except:
                            feedback = job["feedback"][0]["comment"]
                            print feedback
                        is_female = re.search(" her |Her |She | she | her/.",
                                              feedback)
                        is_male = re.search(" his |His |He | he | him | him/.",
                                            feedback)
                        break
                    except:
                        continue
            # BUG FIX: was `type(all_fp_assignments == dict)` — the type of
            # a bool, which is always truthy. Compare the value's type, as
            # the hr branch below does.
            elif (type(all_fp_assignments) == dict):
                try:
                    feedback = all_fp_assignments["feedback"]["comment"]
                    print feedback
                except:
                    # BUG FIX: referenced the possibly-unbound loop variable
                    # `job`; read the dict itself (mirrors the hr branch).
                    feedback = all_fp_assignments["feedback"][0]["comment"]
                    print feedback
                is_female = re.search(" her |Her |She | she | her/.", feedback)
                is_male = re.search(" his |His |He | he | him | him/.",
                                    feedback)

            if not is_female and not is_male:
                if (type(all_hr_assignments) == list):
                    for job in all_hr_assignments:
                        try:
                            try:
                                feedback = job["feedback"]["comment"]
                                print feedback
                            except:
                                # BUG FIX: the lookup result was discarded;
                                # assign it to `feedback` before printing.
                                feedback = job["feedback"][0]["comment"]
                                print feedback
                            is_female = re.search(
                                " her |Her |She | she | her/.", feedback)
                            is_male = re.search(
                                " his |His |He | he | him | him/.", feedback)
                            break
                        except:
                            continue
                elif (type(all_hr_assignments) == dict):
                    try:
                        feedback = all_hr_assignments["feedback"]["comment"]
                        print feedback
                    except:
                        feedback = all_hr_assignments["feedback"][0]["comment"]
                        print feedback
                    is_female = re.search(" her |Her |She | she | her/.",
                                          feedback)
                    is_male = re.search(" his |His |He | he | him | him/.",
                                        feedback)

        except Exception as error:
            print "Could not find assignments for gender_by_pronoun: {0}".format(
                error)
            return "unknown"

        if is_female and not is_male:
            return "female"
        elif is_male and not is_female:
            return "male"
        elif is_male and is_female:
            return "ambiguous"
        else:
            return "unknown"
from TwitterSearch import *
import time
from textblob import TextBlob
from gender_detector import GenderDetector
from geopy.geocoders import Nominatim
import datetime

var = 1
# NOTE(review): this excerpt is truncated — the try: below has no visible
# except/finally, and the for-loop body continues past this snippet.
while var == 1 :  # This constructs an infinite loop
    detector = GenderDetector() # It can also be ar, uk, uy.
    try:
        # Search recent English tweets mentioning any of these brands.
        tso = TwitterSearchOrder()
        tso.set_keywords(['starbucks', 'red bull', 'dunkin donut', '#starbucks'], or_operator = True)
        tso.set_language('en')
        ts = TwitterSearch(
            consumer_key = 'your twitter consumer key',
            consumer_secret = 'your twitter consumer secret',
            access_token = 'your twitter access token',
            access_token_secret = 'your twitter access token secret'
         )
        sleep_for = 60 # sleep for 60 seconds
        last_amount_of_queries = 0 # used to detect when new queries are done

        for tweet in ts.search_tweets_iterable(tso):
            #initialize variables needed later
            user_lat = None
            user_lon = None
            print "user id:",tweet['user']['id']
            print "Tweet Text:",tweet['text']
            parse_tweet = TextBlob(tweet['text'])
            parse_tweet.sentiment
Beispiel #22
0
"""
Created on Fri Nov 11 08:37:50 2016

@author: varshith
"""
print(''' 
us - USA
ar - Argentina
uy - Uruguay
uk - United Kingdom
''')
cclist = ['us', 'ar', 'uy', 'uk']

while True:
    country_code = input('Enter country code :')
    if country_code in cclist:
        break
    else:
        print('Entered country code not valid')

while True:
    name = input('Enter first name :')
    if name.find(' ') != -1:
        print('Entered name not valid, Enter first name only')
    else:
        break

from gender_detector import GenderDetector
detector = GenderDetector(country_code)
print(detector.guess(name))
# Infer the gender of each author listed in 'gs_name.txt' (one name per line).

from datetime import datetime
from gender_detector import GenderDetector
detector = GenderDetector('us')

# NOTE(review): the file handle is never closed and `datetime` is unused.
file_name = open("gs_name.txt", "r")
lines_name = file_name.read().split("\n")

all_name = []
all_gender = []

for x in lines_name:
    if (len(x) > 0):
        all_name.append(x)
        yy = x.split(" ")
        #       print yy[0],
        # Guess from the first token; fall back to 'unknown' on any error.
        author_gender = 'unknown'
        try:
            author_gender = detector.guess(yy[0])
        except:
            pass
        print author_gender
        all_gender.append(author_gender)
 def __init__(self):
   """Create the US name detector and preload the online name/org table."""
   self.detector = GenderDetector('us')
   self.load_name_org_online()
Beispiel #25
0
 def __init__(self):
     """Build the gender detector and a Google Maps client from app credentials."""
     self.gender_detector = GenderDetector()
     self.googlemaps_api = GoogleMaps(
         api_key=app_settings.SERVICES_CREDENTIALS['google_api_key'])
 def test_guessing(self):
     """'Marcos' should be classified as male by the US detector."""
     detector = GenderDetector('us')
     self.assertEqual(detector.guess('Marcos'), 'male')
 def test_format_name(self):
     """All case/whitespace variants of 'marcos' should normalize to 'Marcos'."""
     detector = GenderDetector()
     for name in ['mARCOS', ' Marcos ', 'Marcos    ', 'MARCOS']:
         self.assertEqual(detector._format_name(name), 'Marcos')
def are_opposite(first, second):
    """Return True when the two first names form a male/female pair,
    checked in either order; False otherwise."""
    detector = GenderDetector('us')
    orderings = ((first, second), (second, first))
    return any(
        detector.guess(maybe_male) == 'male'
        and detector.guess(maybe_female) == 'female'
        for maybe_male, maybe_female in orderings)
Beispiel #29
0
 def test_guessing(self):
     """'Marcos' should be classified as male by the US detector."""
     detector = GenderDetector('us')
     self.assertEqual(detector.guess('Marcos'), 'male')
Beispiel #30
0
from __future__ import division
from gender_detector import GenderDetector
from nltk import RegexpTokenizer

# One detector per supported country; the list order is us, uk, uy, ar.
detectorus = GenderDetector('us')  # It can also be ar, uk, uy.
detectorar = GenderDetector('ar')
detectoruk = GenderDetector('uk')
detectoruy = GenderDetector('uy')
detector = [detectorus, detectoruk, detectoruy, detectorar]

# Keyword cues used to classify a mention as female or male.
FemaleFeatures = [
    'mrs', 'miss', 'ms', 'girl', 'woman', 'lady', 'madam', 'mother', 'mom',
    'mommy', 'munt'
]
FemaleSuffix = ['tress', 'tess', 'cess']
MaleFeatures = [
    'mr', 'man', 'boy', 'gentleman', 'sir', 'father', 'dad', 'daddy', 'uncle'
]

# Tokenizer matching runs of uppercase letters.
tokenizer = RegexpTokenizer(r'[A-Z]+')

# This method is very expensive and is the bottleneck of the code:
# most of the time is spent determining whether a character is male or
# female. The library used here is the open-source gender_detector
# package; installation instructions are in README.txt.


def gender_gusser(entity, withFeature=False):
    '''

    :param entity: The name enitity found in the scripts
Beispiel #31
0
class TwitterTransformer():
    """Pipeline step: pull unanalyzed tweets from the data lake, enrich
    them (sentiment, geo, gender, tokens/ngrams) and store the result.

    NOTE(review): Python 2 code (print statements) — run under Python 2.
    """

    def __init__(self):
        """Build the gender detector and a Google Maps client."""
        self.gender_detector = GenderDetector()
        self.googlemaps_api = GoogleMaps(
            api_key=app_settings.SERVICES_CREDENTIALS['google_api_key'])

    def process(self, ds, **kwargs):
        """Fetch pending tweet records, enrich each one, insert it into
        the tweets table and mark the source record analyzed."""
        raw_records = self.__fetch_tweets()
        print "{} new tweets have been analyzed".format(len(raw_records))
        conn_db_lake = db_handler.get_connection('data_lake')
        cur_db_lake = conn_db_lake.cursor()

        for record in raw_records:
            tweet = record[2]
            clean_tweet = self.__tweet_cleaner(tweet['text'])
            print clean_tweet
            polarity, sentiment = self.__get_sentiment(clean_tweet)
            coordinates = self.__get_go_points(tweet['user']['location'])
            gender = self.__guess_gender(tweet['user']['name'].split()[0])
            tweet_tokens = self.__tokenizer(clean_tweet)
            processed_tweet = {
                "author": tweet["user"]["screen_name"],
                "tweet_geo": tweet['geo'],
                "tweet_lang": tweet['lang'],
                "tweet_place": tweet['place'],
                "user_description": tweet['user']['description'],
                "user_followers_count": tweet['user']['followers_count'],
                "user_friends_count": tweet['user']['friends_count'],
                "user_lang": tweet['user']['lang'],
                "user_name": tweet['user']['name'],
                "user_location_name": tweet['user']['location'],
                "user_location_coordinate": {
                    "lat": coordinates[0],
                    "lon": coordinates[1]
                } if coordinates else None,
                "user_status_count": tweet['user']['statuses_count'],
                "tweet_created_at": str(parser.parse(tweet['created_at'])),
                "user_created_at":
                str(parser.parse(tweet['user']['created_at'])),
                "tweet_tokens": tweet_tokens,
                'bigrams': ["_".join(x) for x in bigrams(tweet_tokens)],
                'trigrams': ["_".join(x) for x in trigrams(tweet_tokens)],
                "polarity": polarity,
                "sentiment": sentiment,
                "gender": gender,
            }

            try:
                update_query = """
                UPDATE records
                SET is_analyzed=TRUE
                WHERE id={};
                """.format(record[0])
                # NOTE(review): string-built SQL; record[0] comes from our
                # own DB and quotes are escaped, but parameterized queries
                # would be safer.
                query = """INSERT INTO tweets (data, created_at) VALUES ('{}', '{}')""".format(
                    json.dumps(processed_tweet).replace("'", "''"), record[3])

                cur_db_lake.execute(query)
                cur_db_lake.execute(update_query)
                conn_db_lake.commit()
            except Exception as ex:
                conn_db_lake.rollback()
                raise ex

    def __fetch_tweets(self):
        """Return all records of type 'tweet' not yet analyzed."""
        try:
            conn_db_lake = db_handler.get_connection('data_lake')
            cur_db_lake = conn_db_lake.cursor()

            query = """
            SELECT * FROM records
            WHERE type='tweet' AND is_analyzed = false
            """

            cur_db_lake.execute(query)
            return cur_db_lake.fetchall()
        except Exception as ex:
            conn_db_lake.rollback()
            raise ex

    def __guess_gender(self, name):
        """Guess a gender for *name*; returns None when the detector fails."""
        gender = None
        try:
            gender = self.gender_detector.guess(name)
            return gender
        except Exception as e:
            print('error in gender detector')

    def __get_go_points(self, address):
        """Resolve a free-text location to [lat, lng], or None."""
        if not address:
            return None
        coordinate = None
        try:
            res = self.googlemaps_api.search(
                address.strip(string.punctuation + ' ')).first()
            if res:
                coordinate = [res.lat, res.lng]
        except Exception as ex:
            print("Err in geo location convertor")

        return coordinate

    def __tweet_cleaner(self, tweet):
        """Lower-case a tweet and strip URLs, @mentions, extra spaces and '#'."""
        # Convert to lower case
        tweet = tweet.lower()
        # Convert www.* or https?://* to empty string
        # BUG FIX: the www alternative was `www\.[\s]+` (a dot followed by
        # whitespace), which can never match a URL; use [^\s]+ as intended.
        tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', '', tweet)
        # Convert @username to empty string
        tweet = re.sub('@[^\s]+', '', tweet)
        # Remove additional white spaces
        tweet = re.sub('[\s]+', ' ', tweet)
        # Replace #word with word
        tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
        # trim
        tweet = tweet.strip('\'"')

        return tweet

    def __get_sentiment(self, tweet):
        """Return (polarity, label) where label is negative/neutral/positive."""
        res = TextBlob(tweet)
        polarity = res.sentiment.polarity
        if polarity < 0:
            sentiment = 'negative'
        elif polarity == 0:
            sentiment = 'neutral'
        else:
            sentiment = 'positive'

        return (polarity, sentiment)

    def __tokenizer(self, tweet):
        """Keep unique words longer than 3 chars that are real (WordNet)
        non-stopword English words."""
        tokens = []
        for word in tweet.split():
            if len(word) > 3 and word not in stopwords.words(
                    'english') and wordnet.synsets(word):
                tokens.append(word)
        return list(set(tokens))
        #word_final.append(word_processed)
# Flatten the nested per-source word lists into one flat list of words.
word_final =list(itertools.chain(*word_final))
#print word_final
# Persist the flattened word list, one word per line.
rec=open("Names_processed.txt", "w")

for item in word_final:
    rec.write("%s\n" % item)
rec.close()


# For each entry: split concatenated text into words (segment() is
# presumably wordsegment's — import not visible here, TODO confirm),
# take the LONGEST segment as the likely first name, and guess its
# gender.  Python 2 script: note the `print d,e` statement.
for item in word_final:
    c= segment(item)
    d= max(c, key=len)
    #print d
    detector = GenderDetector('us') # It can also be uk.
    e= detector.guess(d) # => 'male'
    print d,e
   
    

"""
rec=open("Names_processed.txt", "w")

for item in word_final:
    rec.write("%s\n" % item)
rec.close()

 

with open("Names_processed.txt", 'r') as word:
class BylineGender():
  """Infer article-author gender from byline strings.

  Combines a first-name gender detector (US corpus) with a manually
  curated (org, name) -> gender table downloaded from a public Google
  spreadsheet.  NOTE(review): this is Python 2 code (print statements,
  StringIO module, csv reader .next()); it will not run unmodified on
  Python 3.
  """
  def __init__(self):
    # First-name detector; 'us' selects the US name-frequency corpus.
    self.detector = GenderDetector('us')
    # Populates self.online_names from the shared spreadsheet.
    self.load_name_org_online()

  def byline_gender(self,byline):
    """Tally detector guesses over every first name found in *byline*.

    Returns {"female": n, "male": n, "unknown": n}.  A detector label
    outside those three keys would raise KeyError here.
    """
    gender_result = {"female":0, "male":0,"unknown":0}
    for name in self.get_first_names(byline):
      if(name is None):
        gender_result["unknown"] += 1
      else:
        gender_result[str(self.detector.guess(name))] += 1
    return gender_result

  def single_name_gender(self,name):
    """Guess the gender of one byline; 'unknown' when no usable first
    name can be extracted."""
    if(name is None or len(name.strip()) == 0):
      return "unknown" 
    n = self.get_first_names(name.strip())
    if len(n) > 0 and n[0] is not None:
      return self.detector.guess(n[0])
    return "unknown"

  def single_name_gender_ascii(self,name):
    """Like single_name_gender, but first folds *name* to ASCII
    (NFKD-decomposed, non-ASCII bytes dropped) for the detector."""
    name = unicodedata.normalize('NFKD', name).encode('ascii','ignore')
    if(name is None or len(name.strip()) == 0):
      return "unknown" 
    n = self.get_first_names(name.strip())
    if len(n) > 0 and n[0] is not None:
      return self.detector.guess(n[0])
    return "unknown"

  # needs error handling
  def load_name_org_online(self):
    """Download the curated gender spreadsheet (CSV export) into
    self.online_names[org][name] = {'gender': ..., 'count': ...}.

    Spreadsheet columns used: 0=org, 1=name, 3=count, 4=gender; '+' is
    the sheet's placeholder for spaces.  Any error mid-parse aborts the
    load with a console message, leaving a partially filled table.
    """
    self.online_names = {}
    url = "https://docs.google.com/spreadsheets/d/1TTX5ymLPjefIrHep2QmHZNb76VunFwfq0x6FNAXjUZk/export?format=csv"
    response = requests.get(url)
    csv_string = response.content
    f = StringIO.StringIO(csv_string)
    eval_base = csv.reader(f, delimiter=',')
    # Skip the header row (Python 2 iterator protocol).
    eval_base.next()
    try:
      for row in eval_base:
        org = row[0].decode("iso-8859-1").replace(u"+",u" ")
        name = row[1].decode("iso-8859-1").replace(u"+",u" ")
        gender = row[4]
        if(not org in self.online_names.keys()):
          self.online_names[org]= {}
        # Only the four sanctioned labels are accepted; other rows skipped.
        if gender in ['male','female','unknown','ignore']:
          self.online_names[org][name] = {}
          self.online_names[org][name]['gender'] = gender
          self.online_names[org][name]['count'] = row[3]
          #print "{0},{1},{2}".format(org,name,gender)
    except:
      print "download unsuccessful", sys.exc_info()[0]

  def org_name_gender(self,org,name):
    """Gender for (org, name): a manual spreadsheet entry wins over the
    name-based inference.  Each decision is appended to
    byline_gender.log, tagged MANUAL / SEMI / INFERENCE.
    """
    # org, name, manual_gender, single_name_gender
    #name = self.to_ascii(name)
    #org = self.to_ascii(org)
    org = org.replace(u",",u" ")
    f = codecs.open("byline_gender.log","a", "utf8")
    f.write(','.join([org,name])+ "\n")
    manual_gender=None
    #exclude = set(string.punctuation)

    try:
      inferred_gender = self.single_name_gender(name)
    except KeyError:
      # Detector can choke on non-ASCII names; retry with an ASCII fold.
      asciiname = unicodedata.normalize('NFKD', name).encode('ascii','ignore')
      inferred_gender = self.single_name_gender(asciiname)
    if org in self.online_names.keys():
      if name in self.online_names[org].keys():
        manual_gender = self.online_names[org][name]['gender']
        f.write(','.join([u'MANUAL',org,name,manual_gender,u""])+ "\n")
      else:
        f.write(','.join([u'SEMI',org,name,inferred_gender,u"",str(self.online_names[org].keys())])+ "\n")
    else:
      f.write(','.join([u"INFERENCE",org,name,u"",inferred_gender])+ "\n")
    if manual_gender in ['male','female','unknown','ignore']:
      f.close()
      return manual_gender
    f.close()
    return inferred_gender

  def online_org_name_gender(self,org,name):
    """Return the manual spreadsheet gender for (org, name), or "" when
    the pair is not in the table."""
    #name = self.to_ascii(name)
    #org = self.to_ascii(org)
    try:
      #print u"{0},{1}".format(org,name)
      if org in self.online_names.keys():
        if name in self.online_names[org].keys():
          return self.online_names[org][name]['gender']
      return ""
    except UnicodeError:
      # NOTE(review): drops into the debugger on bad unicode (and
      # implicitly returns None) — left as-is.
      import pdb;pdb.set_trace()

  #org names is a dictionary in the format
  #org_names[org][name] = number of articles
  def export_org_names(self,org_names,f):
    """Write one CSV line per (org, name) to file object *f*: first the
    observed org_names entries, then spreadsheet-only leftovers.
    Spaces/commas inside fields become '+' to keep the CSV parseable.
    """
    # Shallow copy: entries are popped as they are matched so only the
    # spreadsheet-only leftovers remain for the second pass.
    o_names = self.online_names.copy()
    for org in org_names.keys():
        for name in org_names[org].keys():
            if org in o_names.keys() and name in o_names[org].keys():
              o_names[org].pop(name, None)
            f.write(','.join([org.replace(u",",u"+").replace(u" ",u"+"),name.replace(u",",u"+").replace(u" ",u"+"),self.single_name_gender(name),str(org_names[org][name]),self.online_org_name_gender(org,name)])+ "\n")
    for org in o_names.keys():
      for name in o_names[org].keys():
        f.write(','.join([org.replace(u",",u"+").replace(u" ",u"+"),name.replace(u",",u"+").replace(u" ",u"+"),self.single_name_gender(name),str(self.online_names[org][name]['count']),self.online_org_name_gender(org,name)])+ "\n")

  def strip_extras(self, byline):
    """Strip honorifics, publication boilerplate and trailing
    descriptions from *byline*; returns a UTF-8 encoded bare name.

    NOTE(review): several patterns are unanchored substrings (r'sir',
    r'prof', ...) and will also delete those letter runs *inside*
    names; substitution order matters.
    """
    byline = re.sub(r'general sir ','',byline)
    byline = re.sub(r'american way: ','',byline)
    byline = re.sub(r'president','',byline)
    byline = re.sub(r'sir','',byline)
    byline = re.sub(r'gov(\.)?','',byline)
    byline = re.sub(r'rep(\.)?','',byline)
    byline = re.sub(r'prof','',byline)
    byline = re.sub(r'professor','',byline)
    byline = re.sub(r'.*?rt rev(d)?','',byline)
    # Keep only the first line of multi-line bylines.
    byline = re.sub(r'\n.*','',byline)
    #telegraph cleaning
    #byline = re.sub(r'london-based.*','',byline)
    #byline = re.sub(r'london researcher.*','',byline)
    #byline = re.sub(r' of the.*','',byline)
    #byline = re.sub(r'telegraph tv critic','',byline)
    #byline = re.sub(r'broadcaster','',byline)
    #byline = re.sub(r'interview: ','',byline)
    #byline = re.sub(r'commentary: ','',byline)
    #byline = re.sub(r'telegraph travel writer','',byline)
    #byline = re.sub(r'on gigolo','',byline)
    byline = re.sub(r'more stories by ','',byline)
    byline = re.sub(r'view author.*','',byline)
    byline = re.sub(r'founder of.*','',byline)
    # Drop trailing descriptions ("X is a...", "X covers...", etc.).
    byline = re.sub(r' is (a)?.*','',byline)
    byline = re.sub(r' covers.*','',byline)
    byline = re.sub(r' in .*','',byline)
    byline = re.sub(r' info.*','',byline)
    byline = re.sub(r' writes .*','',byline)
    byline = re.sub(r'graphic(s)? by(:)?','',byline)
    byline = re.sub(r'compiled ','',byline)
    byline = re.sub(r'exclusive ','',byline)
    byline = re.sub(r'special dispatch' ,'',byline)
    byline = re.sub(r'as told to ','',byline)
    byline = re.sub(r' for .*','',byline)
    # Double space marks the start of trailing junk — cut it all.
    byline = re.sub(r'  .*','',byline)
    byline = re.sub(r'interview(ed)?(s)? ','',byline)
    byline = re.sub(r' at.*','',byline)
    byline = re.sub(r'^\| ','',byline)
    #cleaning Telegraph "by"
    byline = re.sub(r'^(by|By|BY) ','',byline)
    byline = re.sub(r'.*? by ','',byline)
    #remove multiple spaces in the middle of a name
    byline = re.sub(r'\s\s',' ',byline)
    byline = re.sub(r'\s\s',' ',byline)
    byline = re.sub(r'^dr ','',byline)
    byline = byline.strip().encode("utf-8")
    return byline

  # TODO: deal with commas
  def get_full_names(self, byline):
    """Split a (possibly multi-author) byline into cleaned full names.

    Recurses on mixed separators (';', ' and ', ','); bylines that
    contain digits yield [].  Empty strings and the literal 'based' are
    dropped from the result (first occurrence only).
    """
    if byline is None:
      return []
    byline = byline.strip().lower()
    # NOTE(review): the is-None re-check is redundant (already handled
    # above); the digit check rejects bylines like "5 reporters".
    if byline is None or re.search('[0-9]',byline) is not None:
      return []
    spaces = byline.count(' ')
    commas = byline.count(',')
    conjunctions = byline.count(' and ')
    semicolons = byline.count(';')
    bylines_result = []
    if(semicolons > 0):
      for name in byline.split(";"):
        if(name.count(";") > 0 or name.count(",") > 0 or name.count(" and ") > 0):
          bylines_result = bylines_result + self.get_full_names(name)
        else:
          bylines_result.append(self.strip_extras(name.strip()))
    elif(conjunctions >0):
      for name in byline.split(' and '):
        if(name.count(";") > 0 or name.count(",") > 0 or name.count(" and ") > 0):
          bylines_result = bylines_result + self.get_full_names(name)
        else:
          bylines_result.append(self.strip_extras(name.strip()))
    elif(commas == 0 and conjunctions == 0 and semicolons == 0):
      bylines_result.append(self.strip_extras(byline))
    elif(spaces >=2 and commas >=1):
      for name in byline.split(','):
        if(name.count(";") > 0 or name.count(",") > 0 or name.count(" and ") > 0):
          bylines_result = bylines_result + self.get_full_names(name)
        else:
          bylines_result.append(self.strip_extras(name.strip()))

    # list.remove drops only the FIRST occurrence of each junk token.
    for junk in ['','based']:
      if junk in bylines_result:
        bylines_result.remove(junk)

    return bylines_result

  def get_first_names(self, byline):
    """Extract first names from *byline* (handles 'A and B' and the
    reversed 'Last, First' form); entries may be None when no usable
    token is found.  Bylines containing digits yield []."""
    if byline is None or re.search('[0-9]',byline) is not None:
      return []
    byline = byline.strip()
    spaces = byline.count(' ')
    commas = byline.count(',')
    conjunctions = byline.count(' and ')
    bylines_result = []
    if(commas == 0 and conjunctions == 0):
      bylines_result.append(self.get_first_name_from_fullname(byline))
    if(conjunctions >0):
      for name in byline.split(' and '):
        bylines_result.append(self.get_first_name_from_fullname(name.strip()))
    if(spaces < 3 and commas == 1):
      bylines_result.append(self.get_first_name_from_reversename(byline))
    return bylines_result

  # assumes there's a single comma
  def get_first_name_from_reversename(self, byline):
    """First name from a reversed 'Surname, First ...' byline."""
    split_byline = [x.strip() for x in byline.split(',')]
    # set offset to 0 since the surname has already been stripped
    return self.get_first_name_from_fullname(split_byline[1], 0)

  def get_first_name_from_fullname(self, byline, offset=None):
    """First usable token of a 'First [Middle] Last' byline.

    Skips initials (single letters or tokens containing '.').  The
    default offset of -1 excludes the last token (the surname); pass
    offset=0 when the surname is already removed.  Returns None when
    every candidate token is an initial.
    """
    if(offset == None):
      offset = -1
    tokens = nltk.word_tokenize(byline)
    first_name = ""
    for i in range(0, (len(tokens)+offset)):
      if(tokens[i].count(".") > 0 or len(tokens[i]) == 1):
        continue
      return tokens[i]
    return None

  def test_first_names(self):
    """Ad-hoc console check of get_first_names against known bylines
    (prints got/expected pairs; Python 2 print statements)."""
    test_strings = [
      ["J. Nathan Matias", ["Nathan"]],
      ["J. Matias", [None]],
      ["J Matias", [None]],
      ["J N Matias", [None]],
      ["J. N. Matias", [None]],
      ["JN Matias", ["JN"]],
      ["Matias, J. Nathan",["Nathan"]],
      ["Mishkin, Pamela", ["Pamela"]],
      ["Pamela Mishkin", ["Pamela"]],
      ["Nathan Matias and Pamela Mishkin", ["Nathan", "Pamela"]],
      ["J. Nathan Matias and Pamela Mishkin", ["Nathan", "Pamela"]]
    ]
    for test_string in test_strings:
      names = self.get_first_names(test_string[0])
      print 'Got: [%s]' % ', '.join(map(str, names))
      print 'Expected: [%s]' % ', '.join(map(str, test_string[1]))
      print '--------'
Beispiel #34
0
def run_pipeline_unlabelled_data():
	vec_mf = CountVectorizer(decode_error='replace')
	vec_ii = CountVectorizer(decode_error='replace')
	trans_mf = TfidfTransformer(use_idf=True)
	trans_ii = TfidfTransformer(use_idf=True)

	# Load Datasets
	train_data_mf = joblib.load(os.path.join(PROJECT_PATH, 'resources/datasets/gender/text/male_vs_female/docs_raw'))
	y_train_mf = joblib.load(os.path.join(PROJECT_PATH, 'resources/datasets/gender/text/male_vs_female/labels'))
	train_data_ii = joblib.load(os.path.join(PROJECT_PATH, 'resources/datasets/gender/text/individual_vs_institution/docs_raw'))
	y_train_ii = joblib.load(os.path.join(PROJECT_PATH, 'resources/datasets/gender/text/individual_vs_institution/labels'))
	test_data = joblib.load(os.path.join(PROJECT_PATH, 'resources/datasets/gender/text/docs_unlabelled_raw'))
	test_names = joblib.load(os.path.join(PROJECT_PATH, 'resources/datasets/gender/text/names_unlabelled_raw'))

	clf_mf = LinearSVC()
	clf_ii = LinearSVC()

	X_mf = trans_mf.fit_transform(vec_mf.fit_transform(train_data_mf))
	X_ii = trans_ii.fit_transform(vec_ii.fit_transform(train_data_ii))

	print 'SHAPE X_mf TRAIN DATA:', X_mf.shape
	print 'SHAPE X_ii TRAIN DATA:', X_ii.shape

	clf_mf.fit(X_mf, y_train_mf)
	clf_ii.fit(X_ii, y_train_ii)

	# Individual vs. Institution
	X_test_ii = trans_ii.transform(vec_ii.transform(test_data))
	prediction = clf_ii.predict(X_test_ii)

	print 'X_test_ii SHAPE:', X_test_ii.shape
	print 'PREDICTION DISTRIBUTION: %.2f / %.2f' % ((np.count_nonzero(prediction) / prediction.shape[0]), ((prediction.shape[0] - np.count_nonzero(prediction)) / prediction.shape[0]))

	print 'INDIVIDUALS:', np.where(prediction == 1)

	test_data_mf = []
	test_names_mf = []
	for idx in np.where(prediction == 1)[0]:
		#print 'IDX=%d; DATA=%r; NAME=%r' % (idx, test_data[idx], test_names[idx])
		test_data_mf.append(test_data[idx])
		test_names_mf.append(test_names[idx])

	print 'LEN TEST DATA:', len(test_data_mf)
	print 'LEN NAMES:', len(test_names_mf)

	# Gender classification
	X_test_mf = trans_mf.transform(vec_mf.transform(test_data_mf))

	print 'SHAPE X_test_mf:', X_test_mf.shape

	detector = GenderDetector('uk')
	names_pred = []
	for idx, n in enumerate(test_names_mf):
		try:
			cands = extract_name(n)
			for c in cands:
				gender = detector.guess(c)
				if (gender != 'unknown'):
					names_pred.append(0 if gender == 'female' else 1)
					break
		except:
			#names_pred.append(1)
			# Use SVM here
			names_pred.append(clf_mf.predict(X_test_mf[idx])[0])
			pass
		else:
			#names_pred.append(1)
			# Use SVM here
			names_pred.append(clf_mf.predict(X_test_mf[idx])[0])

	idx_female = np.where(np.array(names_pred) == 0)[0]
	idx_male = np.where(np.array(names_pred) == 1)[0]

	print 'FEMALE PREDICTION...'
	for i in idx_female:
		print test_data_mf[i]

	print 'MALE PREDICTION...'
	for i in idx_male:
		print test_data_mf[i]
Beispiel #35
0
def get_nominees(year):
    '''Nominees is a dictionary with the hard coded award
    names as keys, and each entry a list of strings. Do NOT change
    the name of this function or what it returns.'''
    if get_tweets(year) == False:
        return {}
    award_tweet_dict = get_award_tweet_dict(year)

    nominees = {award: [] for award in OFFICIAL_AWARDS}
    candidate_names = {award: [] for award in OFFICIAL_AWARDS}
    stoplist = ['best','-','award','for','or','made', 'in', 'a', 'by', 'performance', 'an','golden','globes','role','the']
    clean_award_names = {award: [w for w in award.split() if not w in stoplist] for award in OFFICIAL_AWARDS}
    cfd = {}

    person_award_identifiers = ["director","actor","actress","demille"]
    name_pattern = re.compile(r'[A-Z][a-z]+\s[A-Z][a-z]+')

    # Harvest capitalized "First Last" candidates for person awards,
    # rejecting any name that contains an award word or stop word.
    for award in OFFICIAL_AWARDS:
        if not any(tag in award for tag in person_award_identifiers):
            continue
        blocked = clean_award_names[award] + stoplist
        for tweet in award_tweet_dict[award]:
            for name in re.findall(name_pattern, tweet):
                lowered = name.lower().split()
                if all(word not in lowered for word in blocked):
                    candidate_names[award].append(name)

    detector = GenderDetector('us')

    def top_names_by_gender(award, keep):
        # Shared frequency ranking for the three person-award branches:
        # top-50 candidates whose first-name gender guess passes `keep`.
        cfd[award] = nltk.FreqDist(candidate_names[award])
        picked = []
        for entry in cfd[award].most_common(50):
            if keep(detector.guess(entry[0].split()[0])):
                picked.append(entry[0])
        return picked

    for award in OFFICIAL_AWARDS:

        if 'actor' in award:
            nominees[award] = top_names_by_gender(award, lambda g: g == 'male')

        elif 'actress' in award:
            nominees[award] = top_names_by_gender(award, lambda g: g == 'female')

        elif any(tag in award for tag in ['director','demille']):
            nominees[award] = top_names_by_gender(award, lambda g: g != 'unknown')

        else:
            # Non-person awards: rank capitalized, non-handle bigrams.
            winner_stoplist = ["musical","comedy","motion", "picture","golden","globe","movie","television","best","or","tv","original","series","animated","feature","film","song","drama","-","rt","to","goes","foreign",'the']
            bigrams = []
            for tweet in award_tweet_dict[award]:
                if tweet[:2] == "RT":
                    continue
                for first, second in nltk.bigrams(tweet.split()):
                    if first.lower() in winner_stoplist or second.lower() in winner_stoplist:
                        continue
                    # Only checks capitalization of the first token
                    # (mirrors the original filter).
                    if first[0] != first[0].upper():
                        continue
                    if first[0] in "@#" or second[0] in "@#":
                        continue
                    bigrams.append((first, second))
            cfd[award] = nltk.FreqDist([' '.join(b) for b in bigrams])
            nominees[award] = [n[0] for n in cfd[award].most_common(5)]

    return nominees
Beispiel #36
0
import os
import sys
import json
from gender_detector import GenderDetector
from time import strftime, localtime, time

# record running time
start = time()
print('Starting Time: %s' % strftime("%a,  %b %d, %Y at %H:%M:%S", localtime()))
detector = GenderDetector('us')

# Node list with 'name' and 'id' keys per entry.
with open("9ulovesu.json", 'rb') as source:
    data = json.load(source)

counter = 0
unknown = []
# Emit "id, first_name, gender" per node; best-effort on odd names.
with open("9ulovesu.data", 'w') as out:
    for node in data['nodes']:
        counter += 1
        first = node['name'].split()[0]
        try:
            print(counter, first.encode('ascii'), detector.guess(first), node['id'])
            out.write('%s, %s, %s\n' % (str(node['id']), first.encode('ascii'), detector.guess(first)))
        except Exception:
            # Non-ASCII name or detector failure: record as unknown.
            # BUG FIX: the original called encode('ascii-8') here —
            # 'ascii-8' is not a codec, so the handler itself raised
            # LookupError and killed the run on the first odd name;
            # encode('ascii', 'ignore') keeps the best-effort intent.
            print(counter, first, 'unknown')
            out.write('%s, %s, unknown\n' % (str(node['id']), first.encode('ascii', 'ignore')))

print('\nEnd Time:  %s' % strftime("%a,  %b %d, %Y at %H:%M:%S", localtime()))
print('Execution Time: %.2f seconds' % (time()-start))