Code Example #1
# Assumed context (not shown in the snippet): the gender-detector package
# available as gender_detector, and SexMachine imported as gender.
def is_male(s):
    try:
        s = s.lower().split(' ')[1]
    except IndexError:
        print("Failed to get first name. %s" % s)
    d1 = gender_detector.GenderDetector('us')
    g = None  # keep g defined even if the guess below raises
    try:
        g = d1.guess(s)
    except KeyError:
        print("Failed to get gender of name %s" % s)

    if g == 'male':
        return 1
    elif g == 'female':
        return 0

    # If the first (faster) detector can't guess it, try the second detector
    d2 = gender.Detector()
    g = d2.get_gender(s)
    if g == 'male':
        return 1
    return 0
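
A quick usage sketch under the assumptions above (note that the function guesses from the second space-separated token of its argument):

print(is_male('mr john'))   # guesses "john" -> 1
print(is_male('ms mary'))   # guesses "mary" -> 0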
Code Example #2
    def __init__(self):
        # Settings
        self.all_data_file_name = './csv_files/altgender4_2017_12_12_upwork_analysis_unitedstates_allskills.csv'  # Filename for all data
        self.data_log_file_name = './log_files/alt_gender4_log_upwork_data_analysis_2017_12_12_unitedstates_allskills.txt'

        # Write a log
        self.log = open(self.data_log_file_name, 'a')
        self.log.write("We have started analyzing data!" + "\n")
        self.log.flush()

        # Connect to the database
        self.conn = psycopg2.connect("dbname=eureka01")
        self.cur = self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
        psycopg2.extensions.register_adapter(dict, psycopg2.extras.Json)

        # Get detailed_info from workers in our database
        self.cur.execute(
            "SELECT detailed_info FROM upwork_unitedstates_allskills_2017_12_12;"
        )

        # Initialize counter for Causal Analysis
        self.user_count = 1

        # Initialize gender detectors
        self.d = gender.Detector()
        self.gc = GenderComputer('./nameLists')
        self.us_detector = GenderDetector('us')
        self.ar_detector = GenderDetector('ar')
        self.uk_detector = GenderDetector('uk')
        self.uy_detector = GenderDetector('uy')
        self.gender_guesser = gender_guesser.Detector()
Code Example #3
def predict_sex(name):
    # `name` is a pandas Series of full names; returns an integer-coded sex
    sex_predictor = gender.Detector(unknown_value=u"unknown", case_sensitive=False)
    first_name = name.str.split(' ').str.get(0)
    sex = first_name.apply(sex_predictor.get_gender)
    sex_dict = {'female': -2, 'mostly_female': -1, 'unknown': 0, 'mostly_male': 1, 'male': 2}
    # SexMachine can also return 'andy' (androgynous), which the mapping above
    # does not cover; treat it as unknown so .astype(int) cannot fail on NaN
    sex_code = sex.map(sex_dict).fillna(0).astype(int)
    return sex_code
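
A minimal usage sketch for predict_sex, assuming pandas imported as pd and SexMachine as gender (neither import appears in the snippet):

import pandas as pd
import sexmachine.detector as gender

names = pd.Series(["Alice Smith", "Bob Jones"])
print(predict_sex(names))   # female -> -2, male -> 2; unknown names map to 0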
Code Example #4
def gender(dataframe):
    gender_list = []

    import sexmachine.detector as detector_lib
    d = detector_lib.Detector()

    # Read in host names
    host_names = dataframe["HostName"]

    # Judge the gender of every host name; joint listings count as couples.
    # (Note: the bare substring test for "And" also matches names like "Anderson".)
    for hostname in host_names:
        if "&" in hostname or "And" in hostname or "/" in hostname:
            guess = "couple"
        else:
            first_name = hostname.split(" ")[0]
            guess = d.get_gender(first_name)
        gender_list.append(guess)

    return gender_list
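
A hedged usage sketch, assuming a pandas DataFrame with the HostName column used above:

import pandas as pd

df = pd.DataFrame({"HostName": ["Alice Smith", "Bob & Carol", "Dana Lee"]})
df["host_gender"] = gender(df)   # e.g. ['female', 'couple', ...]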
Code Example #5
def createMovieGraph():
	movieMap, actorMap, directorMap = parseMovieFile(datasetFilename,
		fetchRaceAndGender=True)
	graph, graphDict = createGraphForMovieInfo(movieMap, actorMap, directorMap)

	# Fill in unknown genders with SexMachine
	genderDetector = gender.Detector(unknown_value=None)

	for nodeId in graph.nodes():
		node = graph.node[nodeId]  # NetworkX 1.x attribute access (graph.nodes[nodeId] in 2.x)
		nodeType = node["type"]
		if nodeType != "MOVIE" and not node["gender"] and node["name"] != "":

			firstName = graph.node[nodeId]["name"].split()[0]
			predictedGender = genderDetector.get_gender(firstName)

			# Fill in only genders we're certain about
			if predictedGender == "male" or predictedGender == "female":
				node["gender"] = predictedGender.capitalize()

	# Save to files
	nx.write_gpickle(graph, graphFilename)
	saveDictToFile(graphDict, graphDictFilename, firstRow=["Name", "NodeID"])
Code Example #6
import pandas as pd
import re, sqlite3
from os import path
from sexmachine import detector
from getgender import getGenders
from ethnicolr import pred_census_ln
from urllib.request import urlopen
from bs4 import BeautifulSoup

nndb_base_link = "https://search.nndb.com/search/nndb.cgi?query="
gender_detector = detector.Detector()

gender_table = {
    "male": "Male", "mostly_male": "Male",
    "female": "Female", "mostly_female": "Female"
    }

race_table = {
    "api": "Asian", 
    "black": "Black", 
    "hispanic": "Hispanic", 
    "white": "White"
    }

gender_num_table = {"Male": "num_males", "Female": "num_females"}
race_num_table = {"White": "num_whites", "Black": "num_blacks", "Asian": "num_asians", "Hispanic": "num_hispanics"}

def guessActorInfo(actor_name):
    # If NNDB bio page not found then use SexMachine or ethnicolr module to guess gender/race
    print("\t\tNNDB bio page not found...Using Python modules to guess")
    first_name = actor_name.split(" ")[0]
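
The snippet is cut off here; a hedged sketch of how the lookup tables above would presumably be applied to the detector outputs (the 'Unknown' fallback and the return shape are assumptions, not the project's code):

    # Map SexMachine's raw label ('male', 'mostly_female', ...) to a display label
    raw_gender = gender_detector.get_gender(first_name)
    guessed_gender = gender_table.get(raw_gender, "Unknown")

    # ethnicolr's pred_census_ln predicts race from the last name
    last_name = actor_name.split(" ")[-1]
    race_df = pred_census_ln(pd.DataFrame({"name": [last_name]}), "name")
    guessed_race = race_table.get(race_df.loc[0, "race"], "Unknown")
    return guessed_gender, guessed_race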
Code Example #7
favourites_list = pickle.load(open('favourites_count.pkl', 'rb'))
statuses_list = pickle.load(open('statuses_count.pkl', 'rb'))
location_dict = pickle.load(open('location_dict_scraper.pkl', 'rb'))


#['created_at','statuses_count','followers_count','favourites_count','sex_code','lang_code']

#1.scraping username section
#gmql0nx0.l94mrbxd.p1ri9a11.lzcic4wl.bp9cbjyn.j83agx80
elems = driver.find_elements_by_class_name(
    "gmql0nx0.l94mrbxd.p1ri9a11.lzcic4wl.bp9cbjyn.j83agx80")
username = elems[0].text
username = pd.Series(username)
#predicting sex
sex_predictor = gender.Detector(unknown_value=u"unknown", case_sensitive=False)
first_name = username.str.split(' ').str.get(0)
sex = first_name.apply(sex_predictor.get_gender)
sex_dict = {
    'female': -2,
    'mostly_female': -1,
    'unknown': 0,
    'mostly_male': 1,
    'male': 2
}
sex_code = sex.map(sex_dict).astype(int)
print(sex_code[0])

#2.scraping bio section
#d2edcug0 hpfvmrgz qv66sw1b c1et5uql lr9zc1uh a8c37x1j keod5gw0 nxhoafnm aigsh9s9 d3f4x2em fe6kdd0r mau55g9w c8b282yb mdeji52x a5q79mjw g1cxx5fr knj5qynh m9osqain oqcyycmt
elems = driver.find_elements_by_class_name(
Code Example #8
def __init__(self):
    self.d = gender.Detector()
Code Example #9
import authors
import sexmachine.detector as gender
import getauthors
import os
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats

#Create file that matches author names to gender
d = gender.Detector(unknown_value='unknown', case_sensitive=False)
fileNameAuthors = 'C:/Users/jstwa/Desktop/Names/allAuthors.txt'
authorDict = authors.authorsToNum(fileNameAuthors)
count = 1
gendersList = []
exceptionsList = []
for author, nums in authorDict.items():
    firstName = author.split(' ')[0]
    currGender = d.get_gender(firstName)
    try:
        gendersList.append(author + ", " + currGender)
    except UnicodeDecodeError:
        exceptionsList.append(firstName + ", " + str(nums[0]) + ", " +
                              currGender)
    if count % 50 == 0:
        getauthors.writeList(gendersList)
        getauthors.writeList(exceptionsList, "exceptions.txt")
    count += 1

#After creating gender file, creates a list of unknown gender authors
os.chdir("C:/Users/jstwa/Desktop")
with open("genders.txt", 'r') as f:
Code Example #10
import os
import math
import pandas as pd
import sexmachine.detector as genderdetector
import matplotlib.pyplot as plt
import numpy as np

plt.rcdefaults()

data_file_path = '../data/reviews.csv'
detector = genderdetector.Detector()
print('test gender of Sam: ' + detector.get_gender('Sam'))

df = pd.read_csv(data_file_path)
print(df.head(1))

gender_count = {}
andy = []
for name in df['author']:
    if not isinstance(name, str):
        continue  # skip NaN values in the author column
    first_name = name.split(' ')[0]
    gender = detector.get_gender(first_name)
    gender_count[gender] = gender_count.get(gender, 0) + 1
    if gender == 'andy': andy.append(first_name)

print(gender_count)
# print list(set(andy))[0:30]

idx = np.arange(len(gender_count))
Code Example #11
import pandas as pd  # assumed import: pd is used below but never imported in the snippet
import numpy as np
import sqlalchemy as sa
from sqlalchemy_utils import database_exists, create_database

import requests
from bs4 import BeautifulSoup

import seaborn as sns
sns.set_context('talk')
sns.set_style('darkgrid')
pd.set_option('display.max_columns', 500)
local_weave_pair = sa.create_engine("postgres://%s@localhost/%s" %
                                    ('jiongz', 'weave_pair'))

import sexmachine.detector as gender
identify_gender = gender.Detector()

from sklearn.externals import joblib


def matches(user_name, user_degree, user_start_yr, looking_for):
    new_user_feature = get_new_user_info(user_name, user_degree, user_start_yr,
                                         looking_for)
    user_features = read_user_features()[:500]
    user_features = user_features[user_features['start_yr'].notnull()]
    meeting_features = meeting_feature_for_newuser(new_user_feature,
                                                   user_features)

    # DataFrame.sort() is the pre-0.20 pandas API; use .sort_values() on modern pandas
    match_result = predic_5star(meeting_features).sort('star_prob',
                                                       ascending=False)
Code Example #12
import sqlite3
import sexmachine.detector as gender
import nltk
import string

db = sqlite3.connect('../commentsData.db')
d = gender.Detector(case_sensitive=False)
c = db.cursor()
#c.execute("create table if not exists maleComments(userID int,username text,comment text)")

#c.execute("insert into maleComments select A.userID, A.username, A.commentBody from comments A join commenterGender B where B.gender="male" and A.userID = B.userID and A.username = B.username")

c.execute("select userID, username, comment from femaleComments")

words_count = 0
comments_count = 0

for result in c.fetchall():
    comment = result[2]
    tokens = nltk.word_tokenize(result[2].lower())
    token_count = 0
    for t in tokens:
        if t not in string.punctuation:
            token_count += 1

    words_count += token_count
    comments_count += 1
    if comments_count % 100000 == 0:
        print(comments_count, "records processed")

print("Avg word_count per Female_Comment: " + str(words_count / comments_count))
Code Example #13
def __init__(self):
    self.detector = gender.Detector(case_sensitive=False)
Code Example #14
def setUp(self):
    self.case = d.Detector()
    self.incase = d.Detector(case_sensitive=False)
Code Example #15
def gender_guess(name):
    """Take a first name and attempt to guess the gender."""
    if name == "":
        return "unknown"
    gen = gender.Detector()  # note: constructing a Detector per call is slow; reuse one where possible
    return gen.get_gender(name)
Code Example #16
def predict():
    '''
    ['username','location','statuses_count','followers_count','friends_count','favourites_count','sex_code','lang_code','created_at']
    {'fr': 3, 'en': 1, 'nl': 6, 'de': 0, 'tr': 7, 'it': 5, 'gl': 4, 'es': 2}
    '''
    #request.form.values()
    data = request.form
    int_features = list(data.values())
    #lang
    lang_dict = {
        'fr': 3,
        'en': 1,
        'nl': 6,
        'de': 0,
        'tr': 7,
        'it': 5,
        'gl': 4,
        'es': 2,
        'hi': 8,
        'other': 9
    }
    '''
    #location
    users = pd.read_csv(r'/home/vamsi82674/Desktop/fake profile detection fb/app/data/processed_data.csv')
    location_list = list(enumerate(np.unique(users['location'])))   
    location_dict = { name : i for i, name in location_list }
    location_dict['other']=1679
    '''

    #created_at
    # NOTE: this days_count is recomputed from the DataFrame below before prediction
    created_date = datetime.datetime.strptime(
        datetime.datetime.strptime(int_features[7],
                                   '%Y-%m-%d').strftime('%m %d %Y'),
        '%m %d %Y')
    today = datetime.datetime.strptime(
        datetime.datetime.now().strftime('%m %d %Y'), '%m %d %Y')
    days_count = (today - created_date).days

    #for local host
    df = pd.DataFrame(
        {
            'bio': int_features[0],
            'statuses_count': int_features[1],
            'followers_count': int_features[5],
            'friends_count': int_features[2],
            'favourites_count': int_features[8],
            'created_at': int_features[7],
            'location': location_dict[int_features[6]],
            'username': int_features[9],
            'lang': lang_dict[int_features[3]]
        },
        index=[0])
    '''
    #for heroku
    #[u'1', u'4', u'2', u'sai', u'other', u'3', u'en', u'2021-04-04', u'bio\r\n', u'']
    df=pd.DataFrame({'bio':,
                     'statuses_count':,
                     'followers_count':,
                     'friends_count':,
                     'favourites_count':,
                     'created_at':,
                     'location':location_dict[],
                     'username':,
                     'lang':lang_dict[]}, index=[0])
    '''

    #created_at
    created_date = datetime.datetime.strptime(
        datetime.datetime.strptime(df.loc[0, 'created_at'],
                                   '%Y-%m-%d').strftime('%m %d %Y'),
        '%m %d %Y')
    today = datetime.datetime.strptime(
        datetime.datetime.now().strftime('%m %d %Y'), '%m %d %Y')
    days_count = (today - created_date).days
    df.loc[0, 'created_at'] = days_count

    #predicting sex
    sex_predictor = gender.Detector(unknown_value=u"unknown",
                                    case_sensitive=False)
    first_name = df['username'].str.split(' ').str.get(0)
    sex = first_name.apply(sex_predictor.get_gender)
    sex_dict = {
        'female': -2,
        'mostly_female': -1,
        'unknown': 0,
        'mostly_male': 1,
        'male': 2
    }
    sex_code = sex.map(sex_dict).astype(int)

    #['created_at','location','statuses_count','followers_count','favourites_count','friends_count','sex_code','lang_code']
    params = pd.Series([
        df['created_at'], df['location'], df['statuses_count'],
        df['followers_count'], df['favourites_count'], df['friends_count'],
        sex_code, df['lang']
    ])

    #Random forest prediction
    rfr_prediction = random_forest.predict(params)

    #support vector machine prediction
    svm_prediction = support_vector.predict(params)

    #Naive Bayes prediction
    nvb_prediction = naive_bayes.predict(params)

    #Decision Tree Prediction
    dtc_prediction = decision_tree.predict(params)

    #neural network prediction
    ds2 = ClassificationDataSet(8, 1, nb_classes=2)
    lst = [
        df['created_at'], df['location'], df['statuses_count'],
        df['followers_count'], df['favourites_count'], df['friends_count'],
        sex_code, df['lang'].astype(int)
    ]
    ds2.addSample(lst, 1)
    ds2._convertToOneOfMany()
    fnn_prediction = neural_network.testOnClassData(dataset=ds2)
    fnn_prediction[0] = 1  # hardcoded override of the network output

    #percent = ( dtc_prediction[0] + nvb_prediction[0] + rfr_prediction[0] + svm_prediction[0] + fnn_prediction[0] )
    percent = (dtc_prediction[0] + rfr_prediction[0] + fnn_prediction[0])

    percent = round(percent * 33)
    #return render_template('result.html',username = int_features[9],dtc_prediction = dtc_prediction[0] , nvb_prediction = nvb_prediction[0] ,rfr_prediction = rfr_prediction[0],svm_prediction = svm_prediction[0],fnn_prediction = fnn_prediction[0],percentage=percent,features=int_features)
    return render_template('result.html',
                           username=int_features[9],
                           dtc_prediction=dtc_prediction[0],
                           rfr_prediction=rfr_prediction[0],
                           fnn_prediction=fnn_prediction[0],
                           percentage=percent,
                           features=int_features)
Code Example #17
def predict():
    '''
    ['statuses_count','followers_count','friends_count','favourites_count','listed_count','sex_code','lang_code']
    {'fr': 3, 'en': 1, 'nl': 6, 'de': 0, 'tr': 7, 'it': 5, 'gl': 4, 'es': 2}
    '''
    int_features = list(request.form.values())
    lang_dict = {
        'fr': 3,
        'en': 1,
        'nl': 6,
        'de': 0,
        'tr': 7,
        'it': 5,
        'gl': 4,
        'es': 2
    }
    df = pd.DataFrame(
        {
            'bio': int_features[0],
            'statuses_count': int_features[1],
            'followers_count': int_features[5],
            'friends_count': int_features[2],
            'favourites_count': int_features[6],
            'listed_count': int_features[8],
            'username': int_features[7],
            'lang': lang_dict[int_features[3]]
        },
        index=[0])
    sex_predictor = gender.Detector(unknown_value=u"unknown",
                                    case_sensitive=False)
    first_name = df['username'].str.split(' ').str.get(0)
    sex = first_name.apply(sex_predictor.get_gender)
    sex_dict = {
        'female': -2,
        'mostly_female': -1,
        'unknown': 0,
        'mostly_male': 1,
        'male': 2
    }
    sex_code = sex.map(sex_dict).astype(int)
    print(type(df['lang']))
    params = [
        df['statuses_count'], df['followers_count'], df['friends_count'],
        df['favourites_count'], df['listed_count'], sex_code,
        df['lang'].astype(int)
    ]

    #Random forest prediction
    rfr_prediction = random_forest.predict([
        df['statuses_count'], df['followers_count'], df['friends_count'],
        df['favourites_count'], df['listed_count'], sex_code, df['lang']
    ])

    #support vector machine prediction
    svm_prediction = support_vector.predict([
        df['statuses_count'], df['followers_count'], df['friends_count'],
        df['favourites_count'], df['listed_count'], sex_code, df['lang']
    ])

    #naive_bayes prediction
    nvb_prediction = naive_bayes.predict([
        df['statuses_count'], df['followers_count'], df['friends_count'],
        df['favourites_count'], df['listed_count'], sex_code, df['lang']
    ])

    # NOTE: fnn_prediction is never assigned in this snippet; a neural-network
    # prediction (as in the companion example) is assumed to exist here
    if rfr_prediction[0] == 0 and svm_prediction[0] == 0 and fnn_prediction[
            0] == 0:
        percent = 100
    elif (rfr_prediction[0] == 0 and svm_prediction[0] == 0
          and fnn_prediction[0] == 1) or (
              rfr_prediction[0] == 0 and svm_prediction[0] == 1
              and fnn_prediction[0] == 0) or (rfr_prediction[0] == 1
                                              and svm_prediction[0] == 0
                                              and fnn_prediction[0] == 0):
        percent = 67
    elif (rfr_prediction[0] == 1 and svm_prediction[0] == 1
          and fnn_prediction[0] == 0) or (
              rfr_prediction[0] == 0 and svm_prediction[0] == 1
              and fnn_prediction[0] == 1) or (rfr_prediction[0] == 1
                                              and svm_prediction[0] == 0
                                              and fnn_prediction[0] == 1):
        percent = 33
    else:
        percent = 0

    return render_template('result.html',
                           rfr_prediction=rfr_prediction[0],
                           svm_prediction=svm_prediction[0],
                           fnn_prediction=fnn_prediction[0],
                           percentage=percent,
                           features=params)
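
The percent branching above just counts how many of the three models predict 1; a compact equivalent sketch (same thresholds, assuming each prediction entry is 0 or 1):

fake_votes = rfr_prediction[0] + svm_prediction[0] + fnn_prediction[0]
percent = {0: 100, 1: 67, 2: 33, 3: 0}[fake_votes]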
Code Example #18
def scrape_prediction():
    #request.form.values()
    data = request.form
    int_features = list(data.values())

    chrome_options = webdriver.ChromeOptions()
    prefs = {"profile.default_content_setting_values.notifications": 2}
    chrome_options.add_experimental_option("prefs", prefs)

    driver = webdriver.Chrome('C:/Users/vamsi/chromedriver.exe',
                              chrome_options=chrome_options)
    #for heroku
    #driver = webdriver.Chrome(executable_path=os.environ.get("CHROME_DRIVER_PATH"), chrome_options=chrome_options)

    #open the webpage
    driver.get("http://www.facebook.com")

    #target username
    username = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='email']")))
    password = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='pass']")))

    #enter username and password
    username.clear()
    username.send_keys("9490461737")
    password.clear()
    password.send_keys("Facebook@62892")
    time.sleep(15)
    #target the login button and click it
    button = WebDriverWait(driver, 2).until(
        EC.element_to_be_clickable(
            (By.CSS_SELECTOR, "button[type='submit']"))).click()
    time.sleep(15)
    #We are logged in!

    url = int_features[0]
    driver.get(url)
    time.sleep(15)
    html = driver.page_source

    #['created_at','statuses_count','followers_count','favourites_count','sex_code','lang_code']

    #1.scraping username section
    #gmql0nx0.l94mrbxd.p1ri9a11.lzcic4wl.bp9cbjyn.j83agx80
    elems = driver.find_elements_by_class_name(
        "gmql0nx0.l94mrbxd.p1ri9a11.lzcic4wl.bp9cbjyn.j83agx80")
    try:
        username = elems[0].text
    except IndexError:  # elems may be empty if the class name no longer matches
        username = '******'

    username = pd.Series(username)
    #predicting sex
    sex_predictor = gender.Detector(unknown_value=u"unknown",
                                    case_sensitive=False)
    first_name = username.str.split(' ').str.get(0)
    sex = first_name.apply(sex_predictor.get_gender)
    sex_dict = {
        'female': -2,
        'mostly_female': -1,
        'unknown': 0,
        'mostly_male': 1,
        'male': 2
    }
    sex_code = sex.map(sex_dict).astype(int)
    print(username)
    print(sex_code[0])

    #2.scraping bio section
    #d2edcug0 hpfvmrgz qv66sw1b c1et5uql lr9zc1uh a8c37x1j keod5gw0 nxhoafnm aigsh9s9 d3f4x2em fe6kdd0r mau55g9w c8b282yb mdeji52x a5q79mjw g1cxx5fr knj5qynh m9osqain oqcyycmt
    elems = driver.find_elements_by_class_name(
        "d2edcug0.hpfvmrgz.qv66sw1b.c1et5uql.lr9zc1uh.a8c37x1j.keod5gw0.nxhoafnm.aigsh9s9.d3f4x2em.fe6kdd0r.mau55g9w.c8b282yb.mdeji52x.a5q79mjw.g1cxx5fr.knj5qynh.m9osqain.oqcyycmt"
    )
    try:
        bio = elems[0].text
    except IndexError:  # elems may be empty if the class name no longer matches
        bio = ''
    print(bio)

    #3.scraping friends count,statuses_count,followers_count,favourites_count
    #d2edcug0 hpfvmrgz qv66sw1b c1et5uql lr9zc1uh e9vueds3 j5wam9gi knj5qynh m9osqain
    #d2edcug0 hpfvmrgz qv66sw1b c1et5uql lr9zc1uh a8c37x1j keod5gw0 nxhoafnm aigsh9s9 d3f4x2em fe6kdd0r mau55g9w c8b282yb iv3no6db jq4qci2q a3bd9o3v lrazzd5p m9osqain
    elems = driver.find_elements_by_class_name(
        "d2edcug0.hpfvmrgz.qv66sw1b.c1et5uql.lr9zc1uh.a8c37x1j.keod5gw0.nxhoafnm.aigsh9s9.d3f4x2em.fe6kdd0r.mau55g9w.c8b282yb.iv3no6db.jq4qci2q.a3bd9o3v.lrazzd5p.m9osqain"
    )
    friend_count = elems[2].text
    friend_count = random.choice(friends_list)  # scraped value is discarded in favour of a sampled one
    print(friend_count)
    #statuses_count
    statuses_count = random.choice(statuses_list)
    print(statuses_count)

    #followers_count
    followers_count = random.choice(followers_list)
    print(followers_count)

    #favourites_count
    favourites_count = random.choice(favourites_list)
    print(favourites_count)

    #4.scraping location
    #oajrlxb2 g5ia77u1 qu0x051f esr5mh6w e9989ue4 r7d6kgcz rq0escxv nhd2j8a9 nc684nl6 p7hjln8o kvgmc6g5 cxmmr5t8 oygrvhab hcukyx3x jb3vyjys rz4wbd8a qt6c0cv9 a8nywdso i1ao9s8h esuyzwwr f1sip0of lzcic4wl oo9gr5id gpro0wi8 lrazzd5p
    elems = driver.find_elements_by_class_name(
        "oajrlxb2.g5ia77u1.qu0x051f.esr5mh6w.e9989ue4.r7d6kgcz.rq0escxv.nhd2j8a9.nc684nl6.p7hjln8o.kvgmc6g5.cxmmr5t8.oygrvhab.hcukyx3x.jb3vyjys.rz4wbd8a.qt6c0cv9.a8nywdso.i1ao9s8h.esuyzwwr.f1sip0of.lzcic4wl.oo9gr5id.gpro0wi8.lrazzd5p"
    )
    location = 'other'
    if location in location_dict:
        location = location_dict[location]
    else:
        location_dict[location] = len(location_dict) + 1
        location = location_dict[location]
        pickle.dump(location_dict,
                    open('location_dict_scraper.pkl', 'wb'),
                    protocol=2)
    print(location)

    #5.scraping created_at
    #d2edcug0 hpfvmrgz qv66sw1b c1et5uql lr9zc1uh a8c37x1j keod5gw0 nxhoafnm aigsh9s9 d3f4x2em fe6kdd0r mau55g9w c8b282yb iv3no6db jq4qci2q a3bd9o3v knj5qynh oo9gr5id hzawbc8m
    elems = driver.find_elements_by_class_name(
        "d2edcug0.hpfvmrgz.qv66sw1b.c1et5uql.lr9zc1uh.a8c37x1j.keod5gw0.nxhoafnm.aigsh9s9.d3f4x2em.fe6kdd0r.mau55g9w.c8b282yb.iv3no6db.jq4qci2q.a3bd9o3v.knj5qynh.oo9gr5id.hzawbc8m"
    )
    created_at = '07 December 1997'
    created_date = datetime.datetime.strptime(
        datetime.datetime.strptime(created_at,
                                   '%d %B %Y').strftime('%m %d %Y'),
        '%m %d %Y')
    today = datetime.datetime.strptime(
        datetime.datetime.now().strftime('%m %d %Y'), '%m %d %Y')
    days_count = (today - created_date).days
    print(days_count)

    #6.language
    #lang
    lang_dict = {
        'fr': 3,
        'en': 1,
        'nl': 6,
        'de': 0,
        'tr': 7,
        'it': 5,
        'gl': 4,
        'es': 2,
        'hi': 8,
        'other': 9
    }

    #['created_at','location','statuses_count','followers_count','favourites_count','friends_count','sex_code','lang_code']
    df = pd.DataFrame(
        {
            'bio': bio,
            'statuses_count': statuses_count,
            'followers_count': followers_count,
            'friends_count': friend_count,
            'favourites_count': favourites_count,
            'created_at': days_count,
            'location': location,
            'sex_code': sex_code,
            'lang': lang_dict['hi']
        },
        index=[0])
    params = pd.Series([
        df['created_at'], df['location'], df['statuses_count'],
        df['followers_count'], df['favourites_count'], df['friends_count'],
        sex_code, df['lang']
    ])
    print(params)
    #Random forest prediction
    rfr_prediction = random_forest.predict(params)

    #support vector machine prediction
    svm_prediction = support_vector.predict(params)

    #Naive Bayes prediction
    nvb_prediction = naive_bayes.predict(params)

    #Decision Tree Prediction
    dtc_prediction = decision_tree.predict(params)

    #neural network prediction
    ds2 = ClassificationDataSet(8, 1, nb_classes=2)
    lst = [
        df['created_at'], df['location'], df['statuses_count'],
        df['followers_count'], df['favourites_count'], df['friends_count'],
        sex_code, df['lang'].astype(int)
    ]
    ds2.addSample(lst, 1)
    ds2._convertToOneOfMany()
    fnn_prediction = neural_network.testOnClassData(dataset=ds2)

    percent = (dtc_prediction[0] + nvb_prediction[0] + rfr_prediction[0] +
               svm_prediction[0] + fnn_prediction[0])
    percent = round(percent * 20)

    return render_template('result.html',
                           username=username[0],
                           dtc_prediction=dtc_prediction[0],
                           nvb_prediction=nvb_prediction[0],
                           rfr_prediction=rfr_prediction[0],
                           svm_prediction=svm_prediction[0],
                           fnn_prediction=fnn_prediction[0],
                           percentage=percent,
                           features=int_features)
Code Example #19
File: gender.py  Project: jahapaula/yle
#READ CSV
columns = defaultdict(list)
with open('NERoutput.csv') as f:
    reader = csv.DictReader(f)
    for row in reader:
        for (k, v) in row.items():
            columns[k].append(v)
ids = columns.get('id')
#contents = columns.get('content')
ner = columns.get('NER')
print "csv file read"

data = {idn: row for idn, row in zip(ids, ner)}  # keyed by article id (avoid shadowing id())
output = []  # collects (id, name, gender, tag) tuples
d = gender.Detector()
for idn in data:
    ent = data.get(idn)
    #CLEAN ENT
    #if tag is PER, get first name and store by id, gender in dict
    if isinstance(ent, str):
        if ent[1:6] == 'I-PER':
            ent = ent.replace("u'", '')
            ent = ent.replace("'", '')
            rec = ent[7:-2].split(';')
            for entity in rec:
                name = ''.join(ch for ch in entity if ch.isalnum())
                output.append((idn, name, d.get_gender(name), 'XOXO'))

print "writing to file"
#WRTITE gender TO FILE
Code Example #20
# demographics: name, code, gender, affiliation, seniority, pub_rate, first_author_por, avg_coauthor
from unidecode import unidecode
import sexmachine.detector as gender
d = gender.Detector(case_sensitive=False, unknown_value="")


author_names = {}
current_year = 2019

with open("../csv/authors_unique.csv") as f_authors:
	f_authors.readline()
	for line in f_authors:
		line = line.strip()
		parts = line.split(",")
		author_code = parts[0]
		author_name = parts[1]
		author_name = author_name.strip()
		author_names[int(author_code)] = author_name


with open("../csv/basic_demographics.csv") as f, open("../csv/demographics.csv","w") as f_out:
	# write header
	f_out.write("author_name,author_id,gender,affiliation,seniority,nb_publi,pub_rate,first_author_por,avg_coauthor\n")
	f.readline()
	for line in f:
		line = line.strip()
		parts = line.split(",")
		author_id = parts[0]
		first_publication_year = parts[1]
		number_of_publications = parts[2].strip()
		number_of_first_author_publications = parts[3]
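
The snippet stops mid-loop; a hedged sketch of how the demographics row might be completed from the parsed fields (the formulas are assumptions, not the project's code):

		# Guess gender from the first token of the author's name
		name = author_names[int(author_id)]
		guessed_gender = d.get_gender(name.split(" ")[0])
		# Derive seniority and publication rate from the parsed fields
		seniority = current_year - int(first_publication_year)
		pub_rate = float(number_of_publications) / max(seniority, 1)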
Code Example #21
#!/usr/bin/python

import pandas as pd
import sqlite3
import sexmachine.detector as gender
import matplotlib.pyplot as plt

d = gender.Detector(case_sensitive=False, unknown_value='neutral')

# let's start with Python
# read the data out of the database file:
#connec = sqlite3.connect('dat.db')
connec = sqlite3.connect(
    'dat_anon.db')  #last names removed for public database
df = pd.read_sql('select owner_name, language from github', connec)
splitnames = [x.split(' ')
              for x in df['owner_name']]  #redundant if using dat_anon
df['gender'] = [d.get_gender(x[0]) for x in splitnames]
labels = ['male', 'mostly_male', 'neutral', 'mostly_female', 'female']
languages = [
    'JavaScript', 'Python', 'Ruby', 'PHP', 'Java', 'Objective-C', 'C', 'C++',
    'Shell', 'C#', 'Go', 'Perl', 'Clojure', 'Scala', 'Haskell', 'Erlang', 'R'
]
plotdata = pd.DataFrame(index=languages, columns=labels)
for language in languages:
    langdf = df[df['language'] == language]
    vc = langdf['gender'].value_counts()
    norm = sum(vc)
    for gclass, count in vc.items():
        plotdata.loc[language, gclass] = "{:.1f}".format(100 * float(count) / norm)
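
The snippet ends before any plotting; one hedged way to render plotdata (the float cast and chart style are assumptions; plt comes from the import above):

plotdata.astype(float).plot(kind='bar', stacked=True)
plt.ylabel('share of repository owners (%)')
plt.show()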
Code Example #22
def setUp(self):
    self.case = d.Detector()