def cleanup(self):
    """Flag this worker thread as done and release its browser handle."""
    # Log text kept byte-identical to the historical messages
    # (including the original 'definitly' spelling).
    scream.say('Marking thread on %s as definitly finished..' % self.threadId)
    self.finished = True
    scream.say('Terminating/join() thread on %s ...' % self.threadId)
    self.my_browser.close()
def __init__(self, threadId, batch):
    """Set up a daemon worker thread for one classification batch."""
    scream.say('Initiating GeneralGetter, running __init__ procedure.')
    # Base-class init must run before the daemon flag is assigned.
    threading.Thread.__init__(self)
    urllib3.disable_warnings()
    self.threadId = threadId
    self.batch = batch
    self.daemon = True
    self.finished = False
def __init__(self, threadId, batch, fullname):
    """Set up a daemon worker thread for one (name, surname, country) batch."""
    say('Initiating GeneralGetter, running __init__ procedure.')
    # Base-class init must run before the daemon flag is assigned.
    threading.Thread.__init__(self)
    # urllib 3 is a successor to urllib 2, it better handles concurency and it's thread safe
    urllib3.disable_warnings()
    self.threadId = threadId
    self.batch = batch
    self.fullname = fullname
    self.daemon = True
    self.finished = False
def batch_update_database(connection, names, is_locked_tb, sample_tb_name):
    """Persist classification results for every collected fullname.

    For each first name in ``names`` the classification/accuracy pair is
    written back for all fullnames that share that first name.

    :param connection: open MySQL connection (MySQLdb-compatible)
    :param names: dict mapping first name -> {'persons': [fullnames],
        'classification': ..., 'accuracy': ...}
    :param is_locked_tb: when True write to the fixed 'users_ext' table,
        otherwise write to ``sample_tb_name``
    :param sample_tb_name: table/view the user records came from
    """
    # Table names cannot be bound as query parameters; the VALUES are
    # parameterized so user-supplied full names can no longer inject SQL
    # (the old manual '"' escaping was incomplete, e.g. backslashes).
    table = 'users_ext' if is_locked_tb else sample_tb_name
    update_query = ('UPDATE {table} SET gender = %s , accuracy = %s '
                    'where full_name = %s').format(table=table)
    cursor = connection.cursor()
    try:
        for collection in names.values():
            for fullname in collection['persons']:
                say(update_query)
                cursor.execute(update_query,
                               (collection['classification'],
                                collection['accuracy'],
                                fullname.encode('utf-8')))
    finally:
        # Close the cursor even when an UPDATE raises (the original leaked it).
        cursor.close()
def update_single_record(connection, classification, is_locked_tb, sample_tb_name):
    """Write one (fullname, accuracy, gender) classification to the database.

    Retries forever with a 5 s back-off on database errors, matching the
    original keep-trying-until-it-works contract.

    :param connection: open MySQL connection (MySQLdb-compatible)
    :param classification: (fullname, accuracy, gender) tuple
    :param is_locked_tb: when True update the fixed 'users_ext' table,
        otherwise update ``sample_tb_name``
    :param sample_tb_name: table/view joined on id to resolve the user name
    """
    fullname, accuracy, gender = classification
    # Table names cannot be bound as parameters; the values are parameterized
    # so the user-supplied fullname can no longer inject SQL.
    update_query = ('UPDATE {table} t1 JOIN {sample_tb_name} t2 ON t1.id = t2.id '
                    'SET t1.gender = %s , t1.accuracy = %s '
                    'WHERE t2.name = %s').format(
        table='users_ext' if is_locked_tb else sample_tb_name,
        sample_tb_name=sample_tb_name)
    while True:
        try:
            cursor = connection.cursor()
            try:
                say(update_query)
                cursor.execute(update_query,
                               (gender, accuracy, fullname.encode('utf-8')))
            finally:
                # The original only closed the cursor on success.
                cursor.close()
            return
        except Exception:
            # Was a bare `except:` — narrowed so KeyboardInterrupt/SystemExit
            # can still stop the retry loop.
            say("Lost connection to MySQL? Update query failed")
            time.sleep(5)
# NOTE(review): this chunk begins mid-call — the opening parser.add_argument(
# for the flag closed below sits above this view. Comments here are legal
# inside the still-open parentheses.
action="store_true")
parser.add_argument("-limit", "--small_test_chunk",
                    help="Limit records to 500? [True/False]",
                    action="store_true")
# "Lunch" typo is user-visible help text; left untouched (runtime string).
parser.add_argument("-d", "--default",
                    help="Lunch with defaults? [True/False]",
                    action="store_true")
args = parser.parse_args()
# Wire the parsed flags into the global `scream` logger switches.
if args.silent:
    scream.disable_all = True
if args.verbose:
    scream.intelliTag_verbose = True
    scream.say("verbosity turned on")
if args.default:
    # Default run: NamSor backend, no 500-row limit, table "users",
    # answer "yes" to updating users_ext (see execute_check(limit, run_defaults)).
    from sources.namsor import github_gender_finder
    github_gender_finder.execute_check(False, ("users", "yes"))
elif args.interactive:
    # Interactive run: let the operator pick the classification backend.
    # (Choices 2 and 3 are handled past the end of this chunk.)
    print "Please select gender analyzer: "
    print "[1] - gender-api.com"
    print "[2] - genderchecker.com"
    print "[3] - NamSor Gender API"
    var = raw_input("[1-3]: ")
    if (str(var) == '1'):
        from sources.gender_api import github_gender_finder
        github_gender_finder.execute_check()
def execute_check(limit, run_defaults=None):
    """Classify gender for user names read from the database (NamSor flow).

    Reads (name, location) rows from a user-chosen table/view, spawns one
    GeneralGetter worker thread per previously-unseen first name (throttled
    to 4 concurrent workers), and lets workers/DatabaseFactory write results
    back asynchronously.

    :param limit: when truthy, restrict the query to 500 records
    :param run_defaults: optional (table_name, update_users_ext_answer)
        tuple; when given, no interactive prompts are shown
    """
    threads = []
    # Initialize connection to database
    #open('mysqlu.dat', 'r').read(),
    connection = DatabaseFactory.init()
    definitely_say('Testing MySql connection...')
    cursor = DatabaseFactory.test_database(connection)
    DatabaseFactory.check_database_consistency(cursor)
    sample_tb_name = raw_input("Please enter table/view name (where to get users from): ") if run_defaults is None else run_defaults[0]
    record_count = DatabaseFactory.get_record_count(cursor, sample_tb_name, limit)
    cursor.close()
    definitely_say("Database seems to be working. Move on to getting list of users.")
    # populate list of users to memory
    cursor = connection.cursor()
    is_locked_tb = raw_input("Should I update [users_ext] table instead of [" + str(sample_tb_name) + "]? [y/n]: ") if run_defaults is None else run_defaults[1]
    is_locked_tb = True if is_locked_tb in ['yes', 'y'] else False
    definitely_say('Querying all names from the observations set.. This can take around 25-30 sec in LAN.')
    cursor.execute(r'select name, location from ' + str(sample_tb_name) + ' where (type = "USR") and (fake <> 1) and (name rlike "[a-zA-Z]+( [a-zA-Z]+)?"){optional}'.format(optional=" limit 500" if limit else ""))
    # if you are interested in how this table was created, you will probably need to read our paper and contact us as well
    # because we have some more tables with aggregated data compared to standard GitHub Torrent collection
    row = cursor.fetchone()
    # iterator is a float so the progress percentage below divides correctly
    # under Python 2 integer-division rules.
    iterator = 1.0
    min_name_length = 2
    say('We hypothetize that minimum name length are ' + str(min_name_length) + ' characters, like Ho, Sy, Lu')
    # http://www.answers.com/Q/What_is_the_shortest_name_in_the_world
    while row is not None:
        fullname, location = unicode(row[0]), unicode(row[1])
        log("\tFullname is: " + str(fullname.encode('unicode_escape')))
        iterator += 1
        say("[Progress]: " + str((iterator / record_count) * 100) + "% ----------- ")
        if len(fullname) < min_name_length:
            log_warning("--Found too short name field (" + str(fullname.encode('utf-8')) + ") from DB. Skipping..", True)
            row = cursor.fetchone()
            continue
        name, surname = StringUtil.split(fullname, na="encoded_space")
        country_code = LocationUtils.get_code(location)
        if name in names:
            # First name seen before: reuse its cached classification.
            if fullname in names[name]['persons']:
                say("\tSuch fullname already classified! Rare, but can happen. Move on.")
            else:
                say("\tAdding a new fullname to already classified name. Move on")
                names[name]['persons'].append(fullname)
                DatabaseFactory.update_record_threaded(connection, (fullname, names[name]['accuracy'], names[name]['classification']), is_locked_tb, sample_tb_name)
        else:
            # New first name: queue a worker thread to ask the gender API.
            say("\tNew name. Lets start classification.")
            names[name] = {'persons': list(), 'classification': None, 'accuracy': None}
            names[name]['persons'].append(fullname)
            say("\t[Batch load] added new name: " + str(name.encode('utf-8')) + " as deriven from: " + str(fullname.encode('utf-8')))
            job = GeneralGetter(int(iterator), (name, surname, country_code), fullname)
            say('Creating instance of [GeneralGetter] complete')
            say('Appending thread to collection of threads')
            threads.append(job)
            say('Append complete, threads[] now have size: ' + str(len(threads)))
            log_debug('Starting thread ' + str(int(iterator)-1) + '....', True)
            job.start()
            # Throttle: never more than 4 workers in flight at once.
            while (num_working(threads) > 4):
                time.sleep(0.25)  # sleeping for 250 ms - there are already 4 active threads..
        row = cursor.fetchone()
    cursor.close()
    definitely_say("Finished getting gender data, waiting for processes to finish...")
    while (not all_finished(threads)):
        time.sleep(1.00)  # wait for all 4 threads to finish
    #DatabaseFactory.update_database(connection, names, is_locked_tb, sample_tb_name)
    # Drain any database-writer threads DatabaseFactory spawned before closing.
    for t in DatabaseFactory.threads:
        if t.isAlive():
            t.join()
    connection.close()
def execute_check():
    """Classify gender for user names read from the database (gender-api flow).

    Connects to the 'github' MySQL database with credentials bundled as
    package resources, reads distinct user names from a user-chosen
    table/view, batches new first names through GetterJobs workers (at most
    4 concurrent), and finally writes all classifications back in one pass.
    """
    threads = []
    # Initialize connection to database
    #open('mysqlu.dat', 'r').read(),
    first_conn = MSQL.connect(
        host=IP_ADDRESS,
        port=3306,
        # credentials shipped as package data files, read via pkg_resources
        user=pkg_resources.resource_string('sources.gender_api', 'mysqlu.dat'),
        passwd=pkg_resources.resource_string('sources.gender_api', 'mysqlp.dat'),
        db="github",
        connect_timeout=5 * 10**7,
        charset='utf8',
        init_command='SET NAMES UTF8',
        use_unicode=True)
    definitely_say('Testing MySql connection...')
    cursor = test_database(first_conn)
    check_database_consistency(cursor)
    sample_tb_name = raw_input(
        "Please enter table/view name (where to get users from): ")
    # Count distinct names first so the loop below can report progress.
    cursor.execute(
        r'select count(distinct name) from ' + str(sample_tb_name) +
        ' where (type = "USR") and (name rlike "[a-zA-Z]+( [a-zA-Z]+)?")')
    rows = cursor.fetchall()
    record_count = rows[0][0]
    cursor.close()
    definitely_say(
        "Database seems to be working. Move on to getting list of users.")
    # populate list of users to memory
    cursor = first_conn.cursor()
    is_locked_tb = raw_input("Should I update [users_ext] table instead of [" + str(sample_tb_name) + "]? [y/n]: ")
    is_locked_tb = True if is_locked_tb in ['yes', 'y'] else False
    definitely_say(
        'Querying all names from the observations set.. This can take around 25-30 sec.')
    cursor.execute(
        r'select distinct name from ' + str(sample_tb_name) +
        ' where (type = "USR") and (name rlike "[a-zA-Z]+( [a-zA-Z]+)?")')
    # if you are interested in how this table was created, you will probably need to read our paper and contact us as well
    # because we have some more tables with aggregated data compared to standard GitHub Torrent collection
    row = cursor.fetchone()
    # iterator is a float so the progress percentage below divides correctly
    # under Python 2 integer-division rules.
    iterator = 1.0
    min_name_length = 2
    say('We hypothetize that minimum name length are ' + str(min_name_length) + ' characters, like Ho, Sy, Lu')
    # http://www.answers.com/Q/What_is_the_shortest_name_in_the_world
    while row is not None:
        fullname = unicode(row[0])
        log("\tFullname is: " + str(fullname.encode('unicode_escape')))
        iterator += 1
        say("[Progress]: " + str((iterator / record_count) * 100) + "% ----------- ")
        if len(fullname) < min_name_length:
            log_warning(
                "--Found too short name field (" + str(fullname.encode('utf-8')) + ") from DB. Skipping..", True)
            row = cursor.fetchone()
            continue
        name = fullname.split()[0]
        # I find it quite uncommon to seperate name from surname with something else than a space
        # In some cultures first name comes after surname, but very often for the sake of westerners,
        # this is reversed-back (source: https://en.wikipedia.org/wiki/Surname#Order_of_names)
        log("\tName is: " + str(name.encode('unicode_escape')))
        if name in names:
            # First name seen before: just record the extra fullname.
            if fullname in names[name]['persons']:
                say("\tSuch fullname already classified! Rare, but can happen. Move on.")
            else:
                say("\tAdding a new fullname to already classified name. Move on")
                names[name]['persons'].append(fullname)
        else:
            say("\tNew name. Lets start classification.")
            names[name] = {'persons': list(), 'classification': None}
            names[name]['persons'].append(fullname)
            say("\t[Batch load] added new name: " + str(name.encode('utf-8')) + " as deriven from: " + str(fullname.encode('utf-8')))
            # start the worker when stack is full
            # NOTE(review): stackWith appears to accumulate names and return a
            # job only when a batch is ready — confirm against GetterJobs.
            jobLoad = GetterJobs.stackWith(int(iterator), name)
            if jobLoad is not None:
                say('Creating instance of [GeneralGetter] complete')
                say('Appending thread to collection of threads')
                threads.append(jobLoad)
                say('Append complete, threads[] now have size: ' + str(len(threads)))
                log_debug('Starting thread ' + str(int(iterator) - 1) + '....', True)
                jobLoad.start()
                # Throttle: never more than 4 workers in flight at once.
                while (num_working(threads) > 4):
                    time.sleep(0.2)  # sleeping for 200 ms - there are already 4 active threads..
        row = cursor.fetchone()
    cursor.close()
    definitely_say(
        "Finished getting gender data, moving to database update...")
    # Single-pass write-back of every classification gathered above.
    for key in names.keys():
        collection = names[key]
        for fullname in names[key]['persons']:
            cursor = first_conn.cursor()
            update_query = r'UPDATE {table} SET gender = {gender} , accuracy = {accuracy} where name = "{fullname}"'.format(
                gender=collection['classification'],
                fullname=fullname.encode('utf-8').replace('"', '\\"'),
                table='users' if is_locked_tb else sample_tb_name,
                accuracy=collection['accuracy'])
            say(update_query)
            cursor.execute(update_query)
            cursor.close()
    first_conn.close()
def get_data(self, all_names):
    """Query gender-api.com for a batch of first names and record the results.

    Retries forever (60 s back-off) on empty responses, rate-limit replies
    and network errors, then stores classification/accuracy into the global
    ``names`` dict and marks this worker finished.

    :param all_names: list of first names; order must match the 'result'
        array returned by the API (indexed zip below relies on it)
    """
    global names
    global MALE
    global FEMALE
    self.http = urllib3.PoolManager()
    scream.say('#Ask now the gender-api for names gender')
    self.oauth = get_random_auth()
    while True:
        try:
            # One GET carries the whole batch, names joined with ';'.
            self.adress = ur'https://gender-api.com/get?name={unpack_names}&key={oauth}'.format(
                unpack_names=';'.join(
                    [StripNonAlpha(name) for name in all_names]),
                oauth=self.oauth)
            self.r = self.http.request('GET', self.adress.encode('utf-8'))
            if self.r.data is None:
                scream.say("No answer in http response body!")
                time.sleep(60)
                continue
            # Rate-limit detection: the API answers errno 30 / "limit reached".
            error_messages = ['errno', '30', 'errmsg', 'limit reached']
            if all(x in self.r.data for x in error_messages):
                scream.say("Limit reached! Retry after a minute.")
                scream.say(self.adress)
                scream.say(self.r.data)
                time.sleep(60)
                continue
            break
        except urllib3.exceptions.ConnectionError:
            scream.definitely_say('Site gender-api.com seems to be down' + '. awaiting for 60s before retry')
            time.sleep(60)
        except Exception as exc:
            scream.definitely_say('Some other error: ')
            scream.definitely_say(str(exc))
            time.sleep(60)
    #scream.say('Response read. Parsing json.')
    #scream.say('--------------------------')
    #scream.say(str(self.r.data))
    #scream.say('--------------------------')
    self.result_json = json.loads(self.r.data)
    for idx, val in enumerate(self.result_json['result']):
        self.found_name = val['name']
        self.found_gender = val['gender']
        self.found_accuracy = val['accuracy']
        # NOTE(review): anything other than 'female' (incl. 'unknown') is
        # recorded as MALE — confirm this default is intended.
        if self.found_gender.lower() == 'female':
            names[all_names[idx]]['classification'] = FEMALE
        else:
            names[all_names[idx]]['classification'] = MALE
        #scream.say('Response read. Parsing json.')
        #scream.say('+++++++++++++++++++++++++++++++++++')
        #scream.say(str(all_names[idx]) + ' ' + str(val['name']) + ' ' + str(val['gender']))
        #scream.say('+++++++++++++++++++++++++++++++++++')
        names[all_names[idx]]['accuracy'] = self.found_accuracy
    self.set_finished(True)
def set_finished(self, finished):
    """Record the worker's completion status on this instance."""
    scream.say('Marking the thread %s as finished..' % self.threadId)
    self.finished = finished
def get_data(self, person_tuple, fullname):
    """Query the NamSor gender API for one person and store the result.

    Retries on HTTP >= 300 (30 s back-off), empty bodies and Tomcat error
    pages (60 s back-off, at most 10 attempts for the latter), then records
    classification/accuracy into the global ``names`` dict, schedules a
    threaded DB update and marks this worker finished.

    :param person_tuple: (name, surname, country_code) — country_code may
        be None
    :param fullname: full name used as the database update key
    """
    global names
    global MALE
    global FEMALE
    self.http = urllib3.PoolManager()
    say('#Ask now the namsor gender API for classification')
    name, surname, country_code = person_tuple
    self.network_attempts = 0
    while True:
        try:
            self.adress = ur'http://api.namsor.com/onomastics/api/json/gendre/{name}/{surname}/{country_code}'.format(
                name=StringUtil.StripNonAlpha(name, False),
                surname=StringUtil.StripNonAlpha(surname, True),
                country_code=country_code if country_code is not None else "")
            self.r = self.http.request('GET', self.adress.encode('utf-8'))
            self.network_attempts += 1
            if self.r.status >= 300:
                say("Server response with HTTP code higher than or eq. 300, which means failure!")
                time.sleep(30)
                continue
            if self.r.data is None:
                say("No answer in HTTP response body!")
                time.sleep(60)
                continue
            # Heuristic: a Tomcat error page came back instead of JSON.
            error_messages = [
                'Apache Tomcat', '7.0.52', 'Error report', 'Status report'
            ]
            if all(x in self.r.data for x in error_messages):
                say("HTTP error returned by WWW server. Try again, max 10 times.")
                say(self.adress)
                say(self.r.data)
                time.sleep(60)
                # Fall through to break after the 10th attempt.
                if self.network_attempts < 10:
                    continue
            break
        except urllib3.exceptions.ConnectionError:
            definitely_say('Site api.namsor.com seems to be down' + '. awaiting for 60s before retry')
            time.sleep(60)
        except Exception as exc:
            definitely_say('Some other error: ')
            definitely_say(str(exc))
            time.sleep(60)
    try:
        self.result_json = json.loads(self.r.data)
    except ValueError:
        # Unparseable body (e.g. the error page above): give up on this name.
        self.set_finished(True)
        return
    self.found_gender = self.result_json["gender"]
    # NamSor returns a -1..1 'scale'; stored as an integer percentage.
    self.found_accuracy = int(float(self.result_json["scale"]) * 100)
    names[name]['classification'] = gender_object(self.found_gender)
    names[name]['accuracy'] = self.found_accuracy
    DatabaseFactory.update_record_threaded(
        DatabaseFactory.connection,
        (fullname, self.found_accuracy, gender_object(self.found_gender)))
    self.set_finished(True)
def cleanup(self):
    """Permanently mark this worker thread as finished."""
    # Log text kept byte-identical (including the historical 'definitly' typo).
    say('Marking thread on %s as definitly finished..' % self.threadId)
    self.finished = True
    say('Terminating/join() thread on %s ...' % self.threadId)