def determine_gender_from_wiki(firstname, lastname=None):
    """Guess a person's gender from their first name, optionally using the
    opening text of their Wikipedia page as extra context.

    Falls back to name-only detection when no last name is given, when no
    Wikipedia page exists, or when disambiguation fails twice in a row.
    """
    if lastname is None:
        return Genderizer.detect(firstName=firstname)
    try:
        # BUG FIX: the original queried `firstname + lastname` with no
        # separator ("JohnSmith"), which rarely matches a real page title.
        p = wikipedia.page(firstname + ' ' + lastname)
        # The first 1000 characters are enough context for Genderizer.
        text_from_wikipedia = p.content[:1000]
        return Genderizer.detect(firstName=firstname, text=text_from_wikipedia)
    except wikipedia.exceptions.PageError:
        # No such page: fall back to name-only detection.
        return Genderizer.detect(firstName=firstname)
    except wikipedia.exceptions.DisambiguationError as e:
        # Ambiguous query: retry with the first suggested option.
        first_option = e.options[0]
        try:
            p = wikipedia.page(first_option)
            text_from_wikipedia = p.content[:1000]
            return Genderizer.detect(firstName=firstname,
                                     text=text_from_wikipedia)
        except wikipedia.exceptions.DisambiguationError:
            # Sometimes wiki thinks the query is ambiguous even if you
            # choose the first option; give up on page text entirely.
            return Genderizer.detect(firstName=firstname)
def on_data(self, data):
    """Tweepy stream callback: enrich one tweet (gender, english-only
    sentiment, geo/date normalisation) and persist it to CouchDB.

    Always returns True so the stream keeps running, even after errors.
    """
    sys.stdout.write("loading data rom one tweet")
    writeLog("loading data rom one tweet")
    # Load Json from Twitter API
    tweet = json.loads(data)
    try:
        tweet["_id"] = str(tweet["id"])  # CouchDB doc id = tweet id
        lang = tweet["lang"]
        name = tweet["user"]["name"]
        # Gender Analysis: guess from the first token of the display name.
        name_list = name.split()
        name = name_list[0]
        gender = Genderizer.detect(firstName=name)
        tweet["user"]["gender"] = gender
        # Sentiment Analysis (computed for every tweet, stored only for en)
        analysed_result = classifier.doSentimentAnalysis(str(tweet["text"]))
        if str(lang) == "en":
            # only analyse english texts
            if not hasAlreadySentiment(tweet):
                tweet = updateSentimentDoc(
                    tweet,
                    analysed_result["sentiment"],
                    analysed_result["polarity"],
                    analysed_result["subjectivity"],
                )
                self.processed_tweets += 1
            else:
                self.ignored_tweets += 1
        else:
            # otherwise ignore it!
            self.ignored_tweets += 1
        # Update place coordinates to work with GeoJson
        tweet = updatePlaceDoc(tweet)
        tweet = updateCoordinate(tweet)
        # Update date fields for better reporting
        tweet = updateDateDay(tweet)
        # Update Sentiment
        tweet = updateSentiment(tweet)
        doc = db.save(tweet)  # Save tweet into CouchDB
        print("Obtained Tweet ID: " + str(tweet["id"]))
        self.tweet_count += 1
        if self.tweet_count % 10000 == 0:
            # Notify when 10000 new tweets have been stored on database
            # (notification email itself is currently disabled below).
            msg_update = "10K new tweets on database: " + settings.database
            # emailer.sendEmail(message=str(msg_update))
    except Exception:
        # BUG FIX: was a bare `except:`, which also swallows SystemExit and
        # KeyboardInterrupt; narrowed so the stream can still be stopped.
        sys.stdout.write("Twitter API error")
        writeLog("Twitter API error")
    return True
def on_data(self, data):
    """Tweepy stream callback: add gender and (english-only) sentiment to
    a tweet, normalise its place info, and save it to CouchDB.

    Always returns True so the stream keeps running, even after errors.
    """
    # Load Json from Twitter API
    tweet = json.loads(data)
    try:
        tweet["_id"] = str(tweet['id'])  # Get the ID
        lang = tweet['lang']
        name = tweet['user']['name']
        # Gender Analysis: use the first token of the display name.
        name_list = name.split()
        name = name_list[0]
        gender = Genderizer.detect(firstName=name)
        tweet['user']['gender'] = gender
        # Sentiment Analysis (computed for every tweet, stored only for en)
        analysed_result = classifier.doSentimentAnalysis(str(tweet['text']))
        if str(lang) == 'en':
            # only analyse english texts
            if not hasAlreadySentiment(tweet):
                tweet = updateSentimentDoc(tweet,
                                           analysed_result["sentiment"],
                                           analysed_result["polarity"],
                                           analysed_result["subjectivity"])
                self.processed_tweets += 1
            else:
                self.ignored_tweets += 1
        else:
            # otherwise ignore it!
            self.ignored_tweets += 1
        # Update place coordinates to work with GeoJson
        tweet = updatePlaceDoc(tweet)
        doc = db.save(tweet)  # Save tweet into CouchDB
        # print("Obtained Tweet ID: " + str(tweet['id']))
        self.tweet_count += 1
        if self.tweet_count % 10000 == 0:
            # Notify when 10000 new tweets have been stored on database
            msg_update = '10K new tweets on database: ' + settings.database
            emailer.sendEmail(message=str(msg_update))
    except Exception:
        # BUG FIX: was a bare `except:`, which also swallows SystemExit and
        # KeyboardInterrupt; narrowed so the stream can still be stopped.
        writeLog("Twitter API error")
    return True
def name_match(input):
    """Classify a token as 'male', 'female', 'unknown_gender', or None.

    A token is rejected (None) when it is not a plausible 3+ letter
    alphabetic word, matches the non-name word list (`re_none_name`), or
    matches the title list (`re_tile`); otherwise gender-guesser (`d`) is
    consulted first, with Genderizer as a fallback.
    """
    # Keep only tokens that are a 3+ letter alphabetic word, optionally
    # wrapped in punctuation, then strip that surrounding punctuation.
    try:
        input = re.search(r'^([^\w]+)?[a-zA-Z]{3,}([^\w]+)?$',
                          input).group()
        input = re.sub(r'(^[^\w]+)|([^\w]+$)', '', input)
    except (AttributeError, TypeError):
        # BUG FIX: was a bare `except:`. A failed search returns None, so
        # .group() raises AttributeError (TypeError for non-string input);
        # catching only those keeps interrupts and real bugs visible.
        return None
    # check if it is a none-name word
    if re.search(re_none_name, input.lower()):
        return None
    # check if it is a title (Mr, Dr, ...)
    if re.search(re_tile, input.lower()):
        return None
    input = input.title()
    try:
        output = d.get_gender(input)
        if output in (u'male', u'mostly_male'):
            return 'male'
        if output in (u'female', u'mostly_female'):
            return 'female'
        if output == u'andy':  # androgynous name
            return 'unknown_gender'
    except Exception:
        pass  # fall through to the Genderizer backend
    try:
        output = Genderizer.detect(firstName=input)
        if output == u'male':
            return 'male'
        if output == u'female':
            return 'female'
    except Exception:
        pass
    return None
def identify_gender(name):
    """Return Genderizer's best guess for the given first name."""
    return Genderizer.detect(firstName=name)
client = MongoClient()
db = client.tvshows
collection = db.users
tweets = db.tweets
count = 0

# Fill in the missing `gender` field for each user, escalating through
# detectors: gender-guesser on the name, then Genderizer on the profile
# description, then Genderizer on up to 10 of the user's tweets, and
# finally the Genderize.io web API.
# NOTE(review): nesting reconstructed from a whitespace-collapsed source
# line; the 'None' fallbacks are assumed to apply only when gender-guesser
# returned 'andy' — confirm against the original file.
for user_md in collection.find({"gender": {"$exists": False}},
                               {"name": 1, "description": 1, "id_str": 1}):
    name = user_md['name']
    description = user_md['description']
    id_str = user_md['id_str']
    # (dropped a dead `user_gender = ''` that was immediately overwritten)
    user_gender = d.get_gender(name)
    if user_gender == 'andy':  # androgynous: name alone is not enough
        try:
            if description != '':
                user_gender = Genderizer.detect(firstName=name,
                                                text=description)
        except ZeroDivisionError:
            # Genderizer's scorer can divide by zero on unhelpful text;
            # treat that as undetermined.
            user_gender = 'None'
        if user_gender == 'None':
            # Try the user's recent tweets as context.
            for user_tweets in tweets.find({"user_id_str": id_str},
                                           {"text": 1}).limit(10):
                twt = user_tweets['text']
                try:
                    user_gender = Genderizer.detect(firstName=name, text=twt)
                    if user_gender != 'None':
                        break
                except ZeroDivisionError:
                    user_gender = 'None'
        if user_gender == 'None':
            # Last resort: the Genderize.io web service.
            # BUG FIX: the name list was called `list`, shadowing the builtin.
            names = []
            names.append(name)
            response_list = Genderize().get(names)
collection2 = db2.pages_editors
# cursor for searching inside the collection. Limit is used to get the
# record of first 22 editors in the list; to check the whole db just
# remove limit.
cursor2 = collection2.find().limit(22)
cursor1 = collection1.find().limit(22)
# BUG FIX: a MongoDB cursor is exhausted after one full pass, so
# re-iterating `cursor1` inside the pages loop only worked for the first
# page. Materialise the editors once and reuse the list.
editor_docs = list(cursor1)
# loop for searching pages
for doc2 in cursor2:
    # loop for searching editors
    for doc1 in editor_docs:
        # this condition detects that the editor has edited this page
        if (doc1['_id']) in (doc2['editors_list']):
            print(
                '-------------------------------------------------------------'
            )
            print('Page Title = ' + doc2['page_title'])
            print('\n')
            print(doc1['id'])
            print('\n')
            # First 120 chars of the editor's raw text, padded then trimmed.
            text = doc1['raw_content'].ljust(120)[:120].strip()
            print("gender by text")
            # Raw text given as an input to the function in genderizer library
            print(Genderizer.detect(text=text))
            print('\n')
        else:
            # NOTE(review): `break` abandons the remaining editors for this
            # page at the first non-match — `continue` looks intended, but
            # behaviour is kept as-is; confirm with the author.
            break
print("end")
import gender_guesser.detector as gender

d = gender.Detector()
from genderizer.genderizer import Genderizer as g

# argument handling
# BUG FIX: was `len(sys.argv) is not 2` — identity comparison on an int is
# unreliable (and a SyntaxWarning on modern CPython); use `!=`.
if len(sys.argv) != 2:
    sys.exit('Please enter a school name! (Options: Penn, Brown, or Harvard)')
school = sys.argv[1].lower()
if school not in ('penn', 'harvard', 'brown'):
    sys.exit('Not a valid school. Try Penn, Brown, or Harvard!')

# NOTE(review): 'rb'/'wb' modes with the csv module are Python 2 style; on
# Python 3 use mode 'r' / 'w' with newline='' — confirm target interpreter.
with open('data/honorees_' + school + '_clean.csv', 'rb') as honorees, \
        open('data/genders_' + school + '.csv', 'wb') as output:
    r = csv.reader(honorees)
    w = csv.writer(output)
    w.writerow(['Name', 'Year', 'Gender Guesser', 'Genderizer', 'Final'])
    for row in r:
        name = row[0]
        names = name.split(' ')
        gender = d.get_gender(names[0])         # gender-guesser verdict
        gender2 = g.detect(firstName=names[0])  # genderizer verdict
        w.writerow([
            name,
            row[1],
            gender,
            'unknown' if gender2 is None else gender2,
            # Final column: only trust the guess when both backends agree.
            gender if gender == gender2 else 'unknown'
        ])
def on_data(self, data): sys.stdout.write("loading data rom one tweet") writeLog("loading data rom one tweet") #Load Json from Twitter API tweet = json.loads(data) try: tweet["_id"] = str(tweet['id']) #Get the ID lang = tweet['lang'] name = tweet['user']['name'] #Gender Analysis: name_list = name.split() name = name_list[0] sys.stdout.write("Gender analysis success") writeLog("Gender analysis success") gender = Genderizer.detect(firstName = name) tweet['user']['gender'] = gender #Sentiment Analysis #analysed_result = classifier.doSentimentAnalysis(str(tweet['text'])) sys.stdout.write("sentiment analysis success") writeLog("sentiment analysis success") if str(lang) == 'es': #only analyse spanish texts if not hasAlreadySentiment(tweet): #tweet = updateSentimentDoc(tweet,analysed_result["sentiment"],analysed_result["polarity"],analysed_result["subjectivity"]) self.processed_tweets += 1 else: self.ignored_tweets += 1 else: #otherwise ignore it! self.ignored_tweets += 1 #Update place coordinates to work with GeoJson tweet = updatePlaceDoc(tweet) writeLog("place updated") tweet = updateCoordinate(tweet) writeLog("coordinates updated") #Update date fields for better reporting tweet = updateDateDay(tweet) writeLog("day updated") #Update Sentiment try: tweet = updateSentiment(tweet) writeLog("sentiment updated") except Exception, err: writeLog("Error updating tweet sentiment") #Only get tweets from Quito or Ecuador try: #writeLog(str(tweet)) if settings.locations != []:#check if the node is harvesting by coordinates writeLog("Obtained by location Tweet") if (str(tweet["place"]["name"]) == 'Quito' or str(tweet["place"]["name"]) == 'Ecuador' or str(tweet["place"]["name"]) == 'Pichincha'): doc = db.save(tweet) #Save tweet into CouchDB print("Obtained by location Tweet ID: " + str(tweet['id'])) writeLog("Obtained by location Tweet from: "+str(tweet["place"]["name"])+"id: "+str(tweet['id'])+" text: "+str(tweet['text'])) else: sys.stdout.write("Tweet Discarded due to wrong place 
mention place: "+str(tweet["place"]["name"])+ " Discarded Tweet ID: " + str(tweet['id'])) else: #save if is harvesting by followers or by writeLog("Obtained by follower or tracks") doc = db.save(tweet) #Save tweet into CouchDB print("Obtained by follower or tracks Tweet ID: " + str(tweet['id'])) writeLog("Obtained by follower or tracks Tweet: "+str(tweet['id'])) except Exception, err: writeLog("Error: "+str(err)+" Tweet already stored: "+str(tweet['id']))
# Notebook-export fragment: `cnt`, `sents`, `self_reference_score`, the
# dataframe `d`, and `self_reference_dict` are defined in earlier cells
# not visible here.
# NOTE(review): this chunk was reconstructed from a whitespace-collapsed
# line; the indentation level of the `append` (inside vs. after an
# enclosing per-document loop) is an assumption — confirm against the
# original notebook.
# Count sentences containing at least one self-reference word (the inner
# break ensures each sentence is counted at most once).
for s in sents:
    words = nltk.word_tokenize(s)
    for w in self_reference_dict:
        if w in words:
            cnt += 1
            break
self_reference_score.append(cnt / float(len(sents)))

# In[77]:

# Guess each host's gender from the first token of their name, using the
# listing summary as extra context when Genderizer accepts it.
gender = []
for i, name in enumerate(d['host_name']):
    try:
        # print nltk.word_tokenize(name)[0]
        gender.append(
            Genderizer.detect(firstName=nltk.word_tokenize(name)[0],
                              text=str(d['summary'][i])))
    except TypeError:
        gender.append('None')
    except ZeroDivisionError:
        # Genderizer's scorer failed on this summary text; retry name-only.
        gender.append(Genderizer.detect(firstName=nltk.word_tokenize(name)[0]))

# In[78]:

# Attach the two new columns to the dataframe.
d = d.join(pd.DataFrame({'self_reference_score': self_reference_score}))
d = d.join(pd.DataFrame({'gender': gender}))

# In[79]:

# NOTE(review): definition truncated at the chunk boundary — body
# continues beyond the visible source.
def gender_to_int(g):
    if (g == 'male'):