Ejemplo n.º 1
0
 def cleanup(self):
     """Flag this worker as finished, then close its browser.

     Sets ``self.finished`` to True and calls ``close()`` on
     ``self.my_browser``; both steps are announced through ``scream.say``.
     """
     thread_label = str(self.threadId)

     scream.say('Marking thread on ' + thread_label +
                ' as definitly finished..')
     self.finished = True

     scream.say('Terminating/join() thread on ' + thread_label + ' ...')
     self.my_browser.close()
Ejemplo n.º 2
0
 def __init__(self, threadId, batch):
     """Set up a daemon worker thread that starts out as not finished.

     threadId -- identifier kept on the instance as ``self.threadId``
     batch    -- payload kept on the instance as ``self.batch``
     """
     scream.say('Initiating GeneralGetter, running __init__ procedure.')

     threading.Thread.__init__(self)
     # The daemon flag is assigned only after the base initialiser has run.
     self.daemon = True

     self.threadId, self.batch = threadId, batch
     self.finished = False

     urllib3.disable_warnings()
Ejemplo n.º 3
0
    def __init__(self, threadId, batch, fullname):
        """Prepare a daemon worker thread holding one classification job.

        The arguments are stored unchanged as ``self.threadId``,
        ``self.batch`` and ``self.fullname``; ``self.finished`` starts out
        as False.
        """
        say('Initiating GeneralGetter, running __init__ procedure.')

        threading.Thread.__init__(self)
        self.daemon = True
        self.finished = False

        self.threadId = threadId
        self.batch = batch
        self.fullname = fullname

        # urllib3 is the successor of urllib2; according to the original
        # author it copes better with concurrency and is thread safe.
        urllib3.disable_warnings()
def batch_update_database(connection, names, is_locked_tb, sample_tb_name):
    """Write the classification of every collected person to the database.

    For each entry of ``names`` one UPDATE is executed per full name in the
    entry's ``'persons'`` list, setting ``gender`` and ``accuracy`` from the
    entry's ``'classification'`` and ``'accuracy'`` values.

    connection     -- open DB-API connection; one cursor is created from it
    names          -- dict whose values are dicts with the keys 'persons',
                      'classification' and 'accuracy'
    is_locked_tb   -- when truthy the ``users_ext`` table is updated,
                      otherwise the table named by ``sample_tb_name``
    sample_tb_name -- table updated when ``is_locked_tb`` is falsy

    Any exception raised by the driver propagates to the caller; the cursor
    is closed in either case.
    """
    table = 'users_ext' if is_locked_tb else sample_tb_name

    # Values are bound as parameters so that quotes/backslashes in a name can
    # neither break nor inject into the statement, and None becomes NULL.
    # Assumes the driver uses the '%s' (format) paramstyle, as MySQL drivers
    # do - TODO confirm for the connection passed in.
    # NOTE(review): identifiers cannot be bound, so the table name is still
    # formatted in; it must come from a trusted source.
    update_query = ('UPDATE {table} SET gender = %s , accuracy = %s '
                    'where full_name = %s').format(table=table)

    cursor = connection.cursor()
    try:
        for collection in names.values():
            for fullname in collection['persons']:
                params = (collection['classification'],
                          collection['accuracy'],
                          fullname.encode('utf-8'))
                say(update_query + ' -- params: ' + repr(params))
                cursor.execute(update_query, params)
    finally:
        # Release the cursor even when one of the updates fails.
        cursor.close()
def update_single_record(connection, classification, is_locked_tb,
                         sample_tb_name):
    """Persist one classification, retrying every 5 seconds until it works.

    connection     -- open DB-API connection
    classification -- tuple ``(fullname, accuracy, gender)``
    is_locked_tb   -- when truthy ``users_ext`` is updated, otherwise the
                      table named by ``sample_tb_name``
    sample_tb_name -- table joined on ``id`` to locate the row by ``name``

    Raises whatever unpacking/encoding ``classification`` raises; database
    errors are logged and retried without limit.
    """
    # Unpack and encode once, outside the retry loop: a malformed argument is
    # a caller bug and must surface instead of being retried forever.
    fullname, accuracy, gender = classification
    params = (gender, accuracy, fullname.encode('utf-8'))

    # Values are bound as parameters (no hand-made quote escaping). Assumes
    # the '%s' (format) paramstyle used by MySQL drivers - TODO confirm.
    # NOTE(review): table names cannot be bound and are still formatted in.
    update_query = (
        'UPDATE {table} t1 JOIN {sample_tb_name} t2 ON t1.id = t2.id '
        'SET t1.gender = %s , t1.accuracy = %s '
        'WHERE t2.name = %s').format(
            table='users_ext' if is_locked_tb else sample_tb_name,
            sample_tb_name=sample_tb_name)

    while True:
        try:
            cursor = connection.cursor()
            try:
                say(update_query + ' -- params: ' + repr(params))
                cursor.execute(update_query, params)
            finally:
                # Do not leak the cursor when execute() fails.
                cursor.close()
            return
        except Exception as exc:
            # Exception (not a bare except) keeps Ctrl-C and SystemExit able
            # to stop the retry loop.
            say("Lost connection to MySQL? Update query failed")
            say(repr(exc))
            time.sleep(5)
Ejemplo n.º 6
0
                        action="store_true")
    parser.add_argument("-limit",
                        "--small_test_chunk",
                        help="Limit records to 500? [True/False]",
                        action="store_true")
    parser.add_argument("-d",
                        "--default",
                        help="Lunch with defaults? [True/False]",
                        action="store_true")

    args = parser.parse_args()
    if args.silent:
        scream.disable_all = True
    if args.verbose:
        scream.intelliTag_verbose = True
        scream.say("verbosity turned on")

    if args.default:
        from sources.namsor import github_gender_finder
        github_gender_finder.execute_check(False, ("users", "yes"))
    elif args.interactive:
        print "Please select gender analyzer: "
        print "[1] - gender-api.com"
        print "[2] - genderchecker.com"
        print "[3] - NamSor Gender API"

        var = raw_input("[1-3]: ")

        if (str(var) == '1'):
            from sources.gender_api import github_gender_finder
            github_gender_finder.execute_check()
def execute_check(limit, run_defaults=None):
    """Classify the gender of the user names stored in a table or view.

    Reads ``name`` and ``location`` rows from the chosen table, starts one
    ``GeneralGetter`` thread per first name not seen before (waiting while
    more than four are working) and forwards further full names of an
    already known first name to ``DatabaseFactory.update_record_threaded``.
    Afterwards it waits for all getter threads and for the threads listed in
    ``DatabaseFactory.threads`` before closing the connection.

    limit        -- when truthy the user query is capped with ``limit 500``
    run_defaults -- optional pair ``(table_name, lock_answer)``; when None
                    both values are requested interactively via raw_input
    """
    threads = []
    ask_user = run_defaults is None

    # Open the database connection and verify it before doing real work.
    connection = DatabaseFactory.init()
    definitely_say('Testing MySql connection...')
    cursor = DatabaseFactory.test_database(connection)
    DatabaseFactory.check_database_consistency(cursor)

    if ask_user:
        sample_tb_name = raw_input("Please enter table/view name (where to get users from): ")
    else:
        sample_tb_name = run_defaults[0]
    record_count = DatabaseFactory.get_record_count(cursor, sample_tb_name, limit)
    cursor.close()

    definitely_say("Database seems to be working. Move on to getting list of users.")

    # Load the list of users through a fresh cursor.
    cursor = connection.cursor()
    if ask_user:
        lock_answer = raw_input("Should I update [users_ext] table instead of [" + str(sample_tb_name) + "]? [y/n]: ")
    else:
        lock_answer = run_defaults[1]
    is_locked_tb = lock_answer in ('yes', 'y')
    definitely_say('Querying all names from the observations set.. This can take around 25-30 sec in LAN.')

    # Only the WHERE part goes through format(), exactly as in the one-line
    # original expression, so the table name is never treated as a template.
    limit_clause = " limit 500" if limit else ""
    where_part = ' where (type = "USR") and (fake <> 1) and (name rlike "[a-zA-Z]+( [a-zA-Z]+)?"){optional}'.format(optional=limit_clause)
    cursor.execute(r'select name, location from ' + str(sample_tb_name) + where_part)
    # The queried table is not part of the standard GitHub Torrent
    # collection; the authors refer to their paper for how it was built.
    row = cursor.fetchone()
    rows_seen = 1.0

    min_name_length = 2
    say('We hypothetize that minimum name length are '
        + str(min_name_length) + ' characters, like Ho, Sy, Lu')
    # http://www.answers.com/Q/What_is_the_shortest_name_in_the_world

    while row is not None:
        fullname = unicode(row[0])
        location = unicode(row[1])
        log("\tFullname is: " + str(fullname.encode('unicode_escape')))
        rows_seen += 1
        say("[Progress]: " + str((rows_seen / record_count) * 100) + "% ----------- ")

        if len(fullname) >= min_name_length:
            name, surname = StringUtil.split(fullname, na="encoded_space")
            country_code = LocationUtils.get_code(location)

            if name not in names:
                say("\tNew name. Lets start classification.")
                names[name] = {'persons': [fullname], 'classification': None, 'accuracy': None}
                say("\t[Batch load] added new name: " + str(name.encode('utf-8')) + " as deriven from: " + str(fullname.encode('utf-8')))
                thread_number = int(rows_seen)
                job = GeneralGetter(thread_number, (name, surname, country_code), fullname)
                say('Creating instance of [GeneralGetter] complete')
                say('Appending thread to collection of threads')
                threads.append(job)
                say('Append complete, threads[] now have size: ' + str(len(threads)))
                log_debug('Starting thread ' + str(thread_number - 1) + '....', True)
                job.start()
                # Throttle: pause in 250 ms steps while more than four
                # threads are reported as working.
                while num_working(threads) > 4:
                    time.sleep(0.25)
            elif fullname in names[name]['persons']:
                say("\tSuch fullname already classified! Rare, but can happen. Move on.")
            else:
                say("\tAdding a new fullname to already classified name. Move on")
                names[name]['persons'].append(fullname)
                # NOTE(review): 'classification'/'accuracy' start as None and
                # may still be None here if the getter for this name has not
                # stored its result yet - confirm the updater copes with it.
                known = names[name]
                DatabaseFactory.update_record_threaded(
                    connection,
                    (fullname, known['accuracy'], known['classification']),
                    is_locked_tb, sample_tb_name)
        else:
            log_warning("--Found too short name field (" + str(fullname.encode('utf-8')) + ") from DB. Skipping..", True)

        row = cursor.fetchone()

    cursor.close()
    definitely_say("Finished getting gender data, waiting for processes to finish...")

    # Poll once per second until every getter thread reports completion.
    while not all_finished(threads):
        time.sleep(1.00)

    # Then wait for the threads tracked by DatabaseFactory that still run.
    for pending in DatabaseFactory.threads:
        if pending.isAlive():
            pending.join()

    connection.close()
Ejemplo n.º 8
0
def _store_classifications(connection, table):
    """Write every classified entry of the global ``names`` dict to ``table``."""
    # Values are bound as parameters instead of being formatted into the SQL
    # text. Assumes the '%s' (format) paramstyle of MySQL drivers.
    # NOTE(review): the table name cannot be bound and is formatted in.
    update_query = ('UPDATE {table} SET gender = %s , accuracy = %s '
                    'where name = %s').format(table=table)
    cursor = connection.cursor()
    try:
        for name, collection in names.items():
            if collection['classification'] is None:
                # No worker delivered a result for this name; writing it
                # would only produce an invalid/meaningless UPDATE.
                say("\tNo classification for: " + str(name.encode('utf-8')) +
                    ". Skipping database update.")
                continue
            for fullname in collection['persons']:
                params = (collection['classification'],
                          collection.get('accuracy'),
                          fullname.encode('utf-8'))
                say(update_query + ' -- params: ' + repr(params))
                cursor.execute(update_query, params)
    finally:
        cursor.close()


def execute_check():
    """Interactively classify the gender of all user names in a table.

    Connects to the ``github`` MySQL database, asks for the table/view to
    read and for the table to update, hands every distinct first name to
    ``GetterJobs.stackWith`` (starting the returned job when there is one,
    and pausing while more than four jobs are working), waits for the
    started jobs and finally writes the collected results back.

    The connection is closed on every exit path; exceptions propagate.
    """
    threads = []
    name_filter = ' where (type = "USR") and (name rlike "[a-zA-Z]+( [a-zA-Z]+)?")'

    # Initialize connection to database
    first_conn = MSQL.connect(
        host=IP_ADDRESS,
        port=3306,
        user=pkg_resources.resource_string('sources.gender_api', 'mysqlu.dat'),
        passwd=pkg_resources.resource_string('sources.gender_api',
                                             'mysqlp.dat'),
        db="github",
        connect_timeout=5 * 10**7,
        charset='utf8',
        init_command='SET NAMES UTF8',
        use_unicode=True)
    try:
        definitely_say('Testing MySql connection...')
        cursor = test_database(first_conn)
        check_database_consistency(cursor)

        sample_tb_name = raw_input(
            "Please enter table/view name (where to get users from): ")
        cursor.execute(
            r'select count(distinct name) from ' + str(sample_tb_name) +
            name_filter)
        record_count = cursor.fetchall()[0][0]
        cursor.close()

        definitely_say(
            "Database seems to be working. Move on to getting list of users.")

        # populate list of users to memory
        cursor = first_conn.cursor()
        is_locked_tb = raw_input(
            "Should I update [users_ext] table instead of [" +
            str(sample_tb_name) + "]? [y/n]: ") in ('yes', 'y')
        definitely_say(
            'Querying all names from the observations set.. This can take around 25-30 sec.'
        )

        cursor.execute(
            r'select distinct name from ' + str(sample_tb_name) + name_filter)
        # The queried table is not part of the standard GitHub Torrent
        # collection; the authors refer to their paper for how it was built.
        row = cursor.fetchone()
        iterator = 1.0

        min_name_length = 2
        say('We hypothetize that minimum name length are ' +
            str(min_name_length) + ' characters, like Ho, Sy, Lu')
        # http://www.answers.com/Q/What_is_the_shortest_name_in_the_world

        while row is not None:
            fullname = unicode(row[0])
            log("\tFullname is: " + str(fullname.encode('unicode_escape')))
            iterator += 1
            say("[Progress]: " + str((iterator / record_count) * 100) +
                "% ----------- ")
            if len(fullname) < min_name_length:
                log_warning(
                    "--Found too short name field (" +
                    str(fullname.encode('utf-8')) + ") from DB. Skipping..",
                    True)
                row = cursor.fetchone()
                continue
            # The first whitespace-separated token is taken as the first
            # name; see https://en.wikipedia.org/wiki/Surname#Order_of_names
            name = fullname.split()[0]
            log("\tName is: " + str(name.encode('unicode_escape')))
            if name not in names:
                say("\tNew name. Lets start classification.")
                # 'accuracy' is created up front so the final update never
                # hits a KeyError for a name that was not classified.
                names[name] = {'persons': [fullname],
                               'classification': None,
                               'accuracy': None}
                say("\t[Batch load] added new name: " +
                    str(name.encode('utf-8')) + " as deriven from: " +
                    str(fullname.encode('utf-8')))
                # start the worker when stack is full
                jobLoad = GetterJobs.stackWith(int(iterator), name)
                if jobLoad is not None:
                    say('Creating instance of [GeneralGetter] complete')
                    say('Appending thread to collection of threads')
                    threads.append(jobLoad)
                    say('Append complete, threads[] now have size: ' +
                        str(len(threads)))
                    log_debug(
                        'Starting thread ' + str(int(iterator) - 1) + '....',
                        True)
                    jobLoad.start()
                # sleeping for 200 ms - there are already 4 active threads..
                while num_working(threads) > 4:
                    time.sleep(0.2)
            elif fullname in names[name]['persons']:
                say("\tSuch fullname already classified! Rare, but can happen. Move on."
                    )
            else:
                say("\tAdding a new fullname to already classified name. Move on"
                    )
                names[name]['persons'].append(fullname)
            row = cursor.fetchone()

        cursor.close()

        # Started workers fill ``names`` asynchronously, so let them finish
        # before reading the results. Presumably num_working() counts the
        # threads that are not finished yet - verify against its definition.
        while num_working(threads) > 0:
            time.sleep(0.2)

        definitely_say(
            "Finished getting gender data, moving to database update...")

        _store_classifications(
            first_conn, 'users' if is_locked_tb else sample_tb_name)
    finally:
        # Close the connection on failures as well as on success.
        first_conn.close()
Ejemplo n.º 9
0
    def get_data(self, all_names):
        """Ask gender-api.com for the gender of ``all_names`` and store it.

        One GET request carrying every name is sent; it is repeated after a
        60 second pause while the request fails, the response has no body or
        the service reports that its limit is reached. The answers are
        written into the module-level ``names`` dict under the
        'classification' and 'accuracy' keys, matched to ``all_names`` by
        position.

        all_names -- sequence of first names, each already a key of ``names``

        ``self.set_finished(True)`` is called on every exit path, including
        an unparsable response, so the thread cannot stay unfinished forever.
        Unexpected exceptions still propagate after the flag is set.
        """
        self.http = urllib3.PoolManager()

        scream.say('#Ask now the gender-api for names gender')
        self.oauth = get_random_auth()

        try:
            while True:
                try:
                    self.adress = u'https://gender-api.com/get?name={unpack_names}&key={oauth}'.format(
                        unpack_names=';'.join(
                            [StripNonAlpha(name) for name in all_names]),
                        oauth=self.oauth)
                    self.r = self.http.request('GET',
                                               self.adress.encode('utf-8'))
                    if self.r.data is None:
                        scream.say("No answer in http response body!")
                        time.sleep(60)
                        continue
                    # All four markers together identify the rate-limit
                    # error answer of the service.
                    error_messages = ['errno', '30', 'errmsg', 'limit reached']
                    if all(x in self.r.data for x in error_messages):
                        scream.say("Limit reached! Retry after a minute.")
                        scream.say(self.adress)
                        scream.say(self.r.data)
                        time.sleep(60)
                        continue
                    break
                except urllib3.exceptions.ConnectionError:
                    scream.definitely_say(
                        'Site gender-api.com seems to be down' +
                        '. awaiting for 60s before retry')
                    time.sleep(60)
                except Exception as exc:
                    scream.definitely_say('Some other error: ')
                    scream.definitely_say(str(exc))
                    time.sleep(60)

            try:
                self.result_json = json.loads(self.r.data)
                results = self.result_json['result']
            except (ValueError, KeyError, TypeError) as exc:
                # Not JSON, or JSON without a 'result' member: nothing can be
                # classified from this answer.
                scream.definitely_say(
                    'Unparsable answer from gender-api.com: ' + repr(exc))
                return

            for queried_name, val in zip(all_names, results):
                self.found_name = val['name']
                self.found_gender = val['gender']
                self.found_accuracy = val['accuracy']

                # NOTE(review): every answer other than 'female' (an unknown
                # or missing gender included) is stored as MALE, as before -
                # confirm this is intended.
                if (self.found_gender or '').lower() == 'female':
                    names[queried_name]['classification'] = FEMALE
                else:
                    names[queried_name]['classification'] = MALE

                names[queried_name]['accuracy'] = self.found_accuracy
        finally:
            self.set_finished(True)
Ejemplo n.º 10
0
 def set_finished(self, finished):
     """Log the change, then store ``finished`` in ``self.finished``."""
     announcement = ''.join(
         ['Marking the thread ', str(self.threadId), ' as finished..'])
     scream.say(announcement)
     self.finished = finished
Ejemplo n.º 11
0
    def get_data(self, person_tuple, fullname):
        """Classify one person through the NamSor gender API and store it.

        person_tuple -- ``(name, surname, country_code)``; ``country_code``
                        may be None, in which case an empty string is sent
        fullname     -- full name forwarded to the database update

        The request is repeated after a pause on connection problems, on an
        HTTP status of 300 or above and on an empty body; the server error
        page is retried until 10 requests were made. The result is written
        to ``names[name]`` ('classification' and 'accuracy') and passed to
        ``DatabaseFactory.update_record_threaded``.

        ``self.set_finished(True)`` is called on every exit path, including
        an unusable response, so the thread cannot stay unfinished forever.
        Unexpected exceptions still propagate after the flag is set.
        """
        self.http = urllib3.PoolManager()

        say('#Ask now the namsor gender API for classification')

        name, surname, country_code = person_tuple

        self.network_attempts = 0
        try:
            while True:
                try:
                    self.adress = u'http://api.namsor.com/onomastics/api/json/gendre/{name}/{surname}/{country_code}'.format(
                        name=StringUtil.StripNonAlpha(name, False),
                        surname=StringUtil.StripNonAlpha(surname, True),
                        country_code=country_code
                        if country_code is not None else "")
                    self.r = self.http.request('GET',
                                               self.adress.encode('utf-8'))
                    self.network_attempts += 1
                    if self.r.status >= 300:
                        say("Server response with HTTP code higher than or eq. 300, which means failure!"
                            )
                        time.sleep(30)
                        continue
                    if self.r.data is None:
                        say("No answer in HTTP response body!")
                        time.sleep(60)
                        continue
                    # All four markers together identify the server's HTML
                    # error page.
                    error_messages = [
                        'Apache Tomcat', '7.0.52', 'Error report',
                        'Status report'
                    ]
                    if all(x in self.r.data for x in error_messages):
                        say("HTTP error returned by WWW server. Try again, max 10 times."
                            )
                        say(self.adress)
                        say(self.r.data)
                        time.sleep(60)
                        if self.network_attempts < 10:
                            continue
                    break
                except urllib3.exceptions.ConnectionError:
                    definitely_say('Site api.namsor.com seems to be down' +
                                   '. awaiting for 60s before retry')
                    time.sleep(60)
                except Exception as exc:
                    definitely_say('Some other error: ')
                    definitely_say(str(exc))
                    time.sleep(60)

            try:
                self.result_json = json.loads(self.r.data)
                self.found_gender = self.result_json["gender"]
                self.found_accuracy = int(
                    float(self.result_json["scale"]) * 100)
            except (ValueError, KeyError, TypeError) as exc:
                # Not JSON (e.g. the error page after the last retry) or
                # JSON lacking a usable 'gender'/'scale' member.
                say('Unusable answer from api.namsor.com: ' + repr(exc))
                return

            classification = gender_object(self.found_gender)
            names[name]['classification'] = classification
            names[name]['accuracy'] = self.found_accuracy

            DatabaseFactory.update_record_threaded(
                DatabaseFactory.connection,
                (fullname, self.found_accuracy, classification))
        finally:
            self.set_finished(True)
Ejemplo n.º 12
0
 def cleanup(self):
     """Set ``self.finished`` to True, logging before and after the change."""
     thread_label = str(self.threadId)

     say('Marking thread on ' + thread_label + ' as definitly finished..')
     self.finished = True
     say('Terminating/join() thread on ' + thread_label + ' ...')