Example No. 1
import wikipedia
from genderizer.genderizer import Genderizer


def determine_gender_from_wiki(firstname, lastname=None):
    # Without a last name there is nothing useful to look up on Wikipedia,
    # so fall back to first-name-only detection.
    if lastname is None:
        return Genderizer.detect(firstName=firstname)
    try:
        p = wikipedia.page(firstname + ' ' + lastname)
        text_from_wikipedia = p.content
        # The article lead is enough context; keep only the first 1000 characters.
        if len(text_from_wikipedia) > 1000:
            text_from_wikipedia = text_from_wikipedia[:1000]
        return Genderizer.detect(firstName=firstname, text=text_from_wikipedia)
    except wikipedia.exceptions.PageError:
        return Genderizer.detect(firstName=firstname)
    except wikipedia.exceptions.DisambiguationError as e:
        # The query is ambiguous: retry with the first disambiguation option.
        print(e.options)
        first_option = e.options[0]
        print(first_option)
        try:
            p = wikipedia.page(first_option)
            print(p.title)
            text_from_wikipedia = p.content
            if len(text_from_wikipedia) > 1000:
                text_from_wikipedia = text_from_wikipedia[:1000]
            return Genderizer.detect(firstName=firstname,
                                     text=text_from_wikipedia)
        except wikipedia.exceptions.DisambiguationError:
            # Wikipedia sometimes still considers the query ambiguous even
            # after choosing the first option, so give up on the article text.
            return Genderizer.detect(firstName=firstname)
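
A minimal usage sketch for the helper above, assuming the wikipedia and genderizer packages are installed; the names and expected outputs are illustrative assumptions, not output from the original code:

# Hypothetical calls; results depend on live Wikipedia content.
print(determine_gender_from_wiki('Marie', 'Curie'))  # expected: 'female'
print(determine_gender_from_wiki('Alex'))            # first-name-only fallback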
Example No. 2
    def on_data(self, data):
        sys.stdout.write("loading data rom one tweet")
        writeLog("loading data rom one tweet")
        # Load Json from Twitter API
        tweet = json.loads(data)
        try:
            tweet["_id"] = str(tweet["id"])  # Get the ID
            lang = tweet["lang"]
            name = tweet["user"]["name"]

            # Gender Analysis:
            name_list = name.split()
            name = name_list[0]

            gender = Genderizer.detect(firstName=name)
            tweet["user"]["gender"] = gender

            # Sentiment Analysis
            analysed_result = classifier.doSentimentAnalysis(str(tweet["text"]))

            if str(lang) == "en":  # only analyse english texts
                if not hasAlreadySentiment(tweet):
                    tweet = updateSentimentDoc(
                        tweet,
                        analysed_result["sentiment"],
                        analysed_result["polarity"],
                        analysed_result["subjectivity"],
                    )
                    self.processed_tweets += 1
                else:
                    self.ignored_tweets += 1
            else:  # otherwise ignore it!
                self.ignored_tweets += 1

            # Update place coordinates to work with GeoJson
            tweet = updatePlaceDoc(tweet)
            tweet = updateCoordinate(tweet)

            # Update date fields for better reporting
            tweet = updateDateDay(tweet)
            # Update Sentiment
            tweet = updateSentiment(tweet)

            doc = db.save(tweet)  # Save tweet into CouchDB
            print("Obtained Tweet ID: " + str(tweet["id"]))
            self.tweet_count += 1
            if self.tweet_count % 10000 == 0:
                # Notify when 10000 new tweets have been stored on database
                msg_update = "10K new tweets on database: " + settings.database
                # emailer.sendEmail(message=str(msg_update))
        except Exception:
            sys.stdout.write("Twitter API error")
            writeLog("Twitter API error")
        return True
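
This on_data method belongs to a tweepy stream-listener class whose definition is not shown. A hedged sketch of how such a listener was typically wired up with the pre-4.0 tweepy streaming API; the MyListener name and the credential variables are assumptions, not part of the original:

import tweepy

# Hypothetical credentials; substitute real Twitter API keys.
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)

listener = MyListener()  # the class defining on_data above
stream = tweepy.Stream(auth, listener)
stream.filter(track=['python'], languages=['en'])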
Example No. 3
    def on_data(self, data):
        #Load Json from Twitter API
        tweet = json.loads(data)
        try:
            tweet["_id"] = str(tweet['id'])  #Get the ID
            lang = tweet['lang']
            name = tweet['user']['name']

            #Gender Analysis:
            name_list = name.split()
            name = name_list[0]

            gender = Genderizer.detect(firstName=name)
            tweet['user']['gender'] = gender

            #Sentiment Analysis
            analysed_result = classifier.doSentimentAnalysis(str(
                tweet['text']))

            if str(lang) == 'en':  #only analyse english texts
                if not hasAlreadySentiment(tweet):
                    tweet = updateSentimentDoc(tweet,
                                               analysed_result["sentiment"],
                                               analysed_result["polarity"],
                                               analysed_result["subjectivity"])
                    self.processed_tweets += 1
                else:
                    self.ignored_tweets += 1
            else:  #otherwise ignore it!
                self.ignored_tweets += 1

            #Update place coordinates to work with GeoJson
            tweet = updatePlaceDoc(tweet)

            doc = db.save(tweet)  #Save tweet into CouchDB
            # print("Obtained Tweet ID: " + str(tweet['id']))
            self.tweet_count += 1
            if self.tweet_count % 10000 == 0:
                #Notify when 10000 new tweets have been stored on database
                msg_update = '10K new tweets on database: ' + settings.database
                emailer.sendEmail(message=str(msg_update))
        except Exception:
            writeLog("Twitter API error")
        return True
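
Examples 2 and 3 both persist tweets with db.save(tweet), which matches the couchdb package API. A minimal sketch of the connection setup such code assumes; the server URL and database name are placeholders:

import couchdb

# Hypothetical connection details.
couch = couchdb.Server('http://localhost:5984/')
db = couch['tweets']  # or couch.create('tweets') on the first run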
Example No. 4
def name_match(input):
    # Strip leading/trailing non-word characters; require at least three letters.
    try:
        input = re.search(r'^([^\w]+)?[a-zA-Z]{3,}([^\w]+)?$', input).group()
        input = re.sub(r'(^[^\w]+)|([^\w]+$)', '', input)
    except (AttributeError, TypeError):
        # re.search found no match (or input was not a string).
        return None
    # Check if it is a non-name word (re_none_name is defined elsewhere).
    if re.search(re_none_name, input.lower()):
        return None
    # Check if it is a title such as Mrs or Prof (re_tile is defined elsewhere).
    if re.search(re_tile, input.lower()):
        return None
    input = input.title()
    try:
        output = d.get_gender(input)
        if output in (u'male', u'mostly_male'):
            return 'male'
        if output in (u'female', u'mostly_female'):
            return 'female'
        if output == u'andy':
            return 'unknown_gender'
    except Exception:
        pass
    try:
        output = Genderizer.detect(firstName=input)
        if output == u'male':
            return 'male'
        if output == u'female':
            return 'female'
    except Exception:
        pass
    return None
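
A few illustrative calls to name_match, assuming re, the regexes re_none_name and re_tile, the detector d, and Genderizer are all defined as the function expects; the inputs and outcomes below are assumptions:

# Hypothetical tokens; actual results depend on the regexes and detectors.
print(name_match('John,'))  # punctuation stripped first, likely 'male'
print(name_match('Mrs'))    # returns None if the title regex covers 'mrs'
print(name_match('ab'))     # fewer than three letters, returns None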
Example No. 5
def identify_gender(name):
    gender = Genderizer.detect(firstName=name)
    return gender
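
Genderizer.detect also accepts an optional text argument that the thin wrapper above does not expose; the other examples on this page rely on it. A short sketch (the expected labels follow the library's documented behaviour, but are assumptions here):

from genderizer.genderizer import Genderizer

print(Genderizer.detect(firstName='John'))  # expected: 'male'
print(Genderizer.detect(firstName='Alex',
                        text='she was very busy'))  # text disambiguates: 'female'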
Example No. 6
from pymongo import MongoClient
from genderizer.genderizer import Genderizer
from genderize import Genderize
import gender_guesser.detector as gender_detector

# d is assumed to be a name-based detector whose get_gender() can return 'andy'.
d = gender_detector.Detector()

client = MongoClient()
db = client.tvshows
collection = db.users

tweets = db.tweets
count = 0
for user_md in collection.find({"gender": {"$exists": False}},
                               {"name": 1, "description": 1, "id_str": 1}):
    name = user_md['name']
    description = user_md['description']
    id_str = user_md['id_str']
    user_gender = d.get_gender(name)
    # 'andy' means the name is androgynous, so try text-based detection.
    if user_gender == 'andy':
        try:
            if description != '':
                user_gender = Genderizer.detect(firstName=name, text=description)
        except ZeroDivisionError:
            user_gender = None
        if user_gender is None:
            # Fall back to the user's ten most recent stored tweets.
            for user_tweets in tweets.find({"user_id_str": id_str}, {"text": 1}).limit(10):
                twt = user_tweets['text']
                try:
                    user_gender = Genderizer.detect(firstName=name, text=twt)
                    if user_gender is not None:
                        break
                except ZeroDivisionError:
                    user_gender = None
        if user_gender is None:
            # Last resort: query the genderize.io web service.
            names = [name]
            response_list = Genderize().get(names)
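
The excerpt ends right after the genderize.io call. A hedged sketch of how response_list might be consumed, based on the documented response format of the genderize package; this continuation is an assumption, not part of the original:

# Each entry resembles {'name': ..., 'gender': 'male'|'female'|None,
# 'probability': ..., 'count': ...} (assumed from the genderize docs).
if response_list and response_list[0].get('gender') is not None:
    user_gender = response_list[0]['gender']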
Example No. 7
collection2 = db2.pages_editors

# Cursors for searching the collections. limit(22) restricts the scan to the
# first 22 records; remove the limit to check the whole database.
cursor2 = collection2.find().limit(22)
# Materialize the editors into a list: a PyMongo cursor is exhausted after one
# pass, so it could not be re-iterated inside the outer loop below.
editors = list(collection1.find().limit(22))

# loop over pages
for doc2 in cursor2:

    # loop over editors
    for doc1 in editors:

        # this condition detects which pages the editor has edited
        if doc1['_id'] in doc2['editors_list']:
            print(
                '-------------------------------------------------------------'
            )
            print('Page Title = ' + doc2['page_title'])
            print('\n')
            print(doc1['id'])
            print('\n')
            # take the first 120 characters of the editor's raw content
            text = doc1['raw_content'].ljust(120)[:120].strip()
            print("gender by text")

            # raw text given as input to the detect function of the genderizer library
            print(Genderizer.detect(text=text))
            print('\n')

print("end")
Example No. 8
import csv
import sys

from gender_guesser.detector import Detector
from genderizer.genderizer import Genderizer as g

d = Detector()

# argument handling
if len(sys.argv) != 2:
    sys.exit('Please enter a school name! (Options: Penn, Brown, or Harvard)')

school = sys.argv[1].lower()

if school not in ('penn', 'harvard', 'brown'):
    sys.exit('Not a valid school. Try Penn, Brown, or Harvard!')

with open('data/honorees_' + school + '_clean.csv', newline='') as honorees, \
        open('data/genders_' + school + '.csv', 'w', newline='') as output:
    r = csv.reader(honorees)
    w = csv.writer(output)

    w.writerow(['Name', 'Year', 'Gender Guesser', 'Genderizer', 'Final'])

    for row in r:
        name = row[0]
        names = name.split(' ')
        gender = d.get_gender(names[0])
        gender2 = g.detect(firstName=names[0])
        # Record both guesses; the Final column keeps a label only when
        # the two detectors agree.
        w.writerow([
            name, row[1], gender, 'unknown' if gender2 is None else gender2,
            gender if gender == gender2 else 'unknown'
        ])
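
The last writerow expression keeps a label in the Final column only when the two detectors agree. The same consensus rule, factored into a small helper for clarity; this is a sketch, not code from the original script:

def consensus(guess_a, guess_b):
    # Normalize Genderizer's None to 'unknown', then require agreement.
    if guess_b is None:
        guess_b = 'unknown'
    return guess_a if guess_a == guess_b else 'unknown'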
Example No. 9
    def on_data(self, data):
        sys.stdout.write("loading data from one tweet")
        writeLog("loading data from one tweet")
        # Load Json from Twitter API
        tweet = json.loads(data)
        try:
            tweet["_id"] = str(tweet['id'])  # Get the ID
            lang = tweet['lang']
            name = tweet['user']['name']

            # Gender Analysis:
            name_list = name.split()
            name = name_list[0]

            gender = Genderizer.detect(firstName=name)
            tweet['user']['gender'] = gender
            sys.stdout.write("Gender analysis success")
            writeLog("Gender analysis success")

            # Sentiment Analysis
            # analysed_result = classifier.doSentimentAnalysis(str(tweet['text']))
            sys.stdout.write("sentiment analysis success")
            writeLog("sentiment analysis success")

            if str(lang) == 'es':  # only analyse Spanish texts
                if not hasAlreadySentiment(tweet):
                    # tweet = updateSentimentDoc(tweet, analysed_result["sentiment"], analysed_result["polarity"], analysed_result["subjectivity"])
                    self.processed_tweets += 1
                else:
                    self.ignored_tweets += 1
            else:  # otherwise ignore it!
                self.ignored_tweets += 1

            # Update place coordinates to work with GeoJson
            tweet = updatePlaceDoc(tweet)
            writeLog("place updated")
            tweet = updateCoordinate(tweet)
            writeLog("coordinates updated")

            # Update date fields for better reporting
            tweet = updateDateDay(tweet)
            writeLog("day updated")
            # Update Sentiment
            try:
                tweet = updateSentiment(tweet)
                writeLog("sentiment updated")
            except Exception:
                writeLog("Error updating tweet sentiment")
            # Only keep tweets from Quito or Ecuador
            try:
                # writeLog(str(tweet))
                if settings.locations != []:  # check if the node is harvesting by coordinates
                    writeLog("Obtained by location Tweet")
                    if str(tweet["place"]["name"]) in ('Quito', 'Ecuador', 'Pichincha'):
                        doc = db.save(tweet)  # Save tweet into CouchDB
                        print("Obtained by location Tweet ID: " + str(tweet['id']))
                        writeLog("Obtained by location Tweet from: " + str(tweet["place"]["name"]) +
                                 " id: " + str(tweet['id']) + " text: " + str(tweet['text']))
                    else:
                        sys.stdout.write("Tweet discarded due to wrong place mention. Place: " +
                                         str(tweet["place"]["name"]) +
                                         " Discarded Tweet ID: " + str(tweet['id']))
                else:  # save if it is harvesting by followers or by tracks
                    writeLog("Obtained by follower or tracks")
                    doc = db.save(tweet)  # Save tweet into CouchDB
                    print("Obtained by follower or tracks Tweet ID: " + str(tweet['id']))
                    writeLog("Obtained by follower or tracks Tweet: " + str(tweet['id']))

            except Exception as err:
                writeLog("Error: " + str(err) + " Tweet already stored: " + str(tweet['id']))
Example No. 10
    # For each document (outer loop not shown): count sentences containing
    # a self-reference word, then record the fraction of such sentences.
    for s in sents:
        words = nltk.word_tokenize(s)
        for w in self_reference_dict:
            if w in words:
                cnt += 1
                break
    self_reference_score.append(cnt / float(len(sents)))

# In[77]:

gender = []
for i, name in enumerate(d['host_name']):
    try:
        # Detect using the first token of the host name plus the listing summary.
        gender.append(
            Genderizer.detect(firstName=nltk.word_tokenize(name)[0],
                              text=str(d['summary'][i])))
    except TypeError:
        gender.append('None')
    except ZeroDivisionError:
        # The text classifier failed; fall back to the first name alone.
        gender.append(Genderizer.detect(firstName=nltk.word_tokenize(name)[0]))

# In[78]:

d = d.join(pd.DataFrame({'self_reference_score': self_reference_score}))
d = d.join(pd.DataFrame({'gender': gender}))

# In[79]:


def gender_to_int(g):
    if g == 'male':