Example #1
def run_group():
    file_dir = "/home/pyongjoo/workspace/tweetsprocess/data/name-feature/"
    infile = file_dir + "screename-May10-AlmostVerified.csv"
    
    print infile
    
    csvreader = csv.reader(open(infile, 'rb'))

    conf_matrix = []
    for i in range(4):
        conf_matrix.append([0,0,0,0])
    
    for row in csvreader:
        screen_name = row[0]
        fullname = row[1]
        age = row[2]
    
        firstname = (fullname.split(' '))[0]
        age_group = babyname.ageToAgeGroup(age)
        
        group_prob = babyname.probHashInGroupForName(firstname)
        
        predicted_group = babyname.maxLikelyGroupForName(firstname)
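        # A predicted_group of -1 means this first name produced no
        # prediction; such rows are skipped when filling the confusion
        # matrix below.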

        print (age_group, predicted_group,
               group_prob[0], group_prob[1], group_prob[2], group_prob[3])

        if predicted_group != -1:
            conf_matrix[age_group][predicted_group] += 1

    print "Confusion Matrix:"
    for i in range(len(conf_matrix)):
        for j in range(len(conf_matrix[0])):
            sys.stdout.write(str(conf_matrix[i][j]) + " ")
        print
Example #2
def run_group():
    file_dir = "/home/pyongjoo/workspace/twitter-research/data/"
    infile = file_dir + "ageEmbededTweets-Jun19-sampled2.0.json"
    print infile

    document = json.loads(open(infile).read())

    conf_matrix = []
    for i in range(4):
        conf_matrix.append([0,0,0,0])

    libsvmoutfile = file_dir + "prob-Jun19.libsvm"
    libsvmout = open(libsvmoutfile, 'w')

    for tweetDoc in document:
        fullname = tweetDoc['user']['name']
        age = tweetDoc['user']['age']

        firstname = (fullname.split(' '))[0]
        firstname = firstname.encode('ascii', 'ignore')
        age_group = babyname.ageToAgeGroup(age)
        prob_hash = babyname.probHashInGroupForName(firstname)
        prob_array = [prob_hash[k] for k in sorted(prob_hash.keys())]
        predicted_group = (-1 if sum(prob_hash.values()) == 0
                else prob_array.index(max(prob_array)))
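        # predicted_group is -1 when the name has no probability mass in any
        # group; those rows are excluded from the libsvm output and the
        # confusion matrix below.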

        if predicted_group != -1:
            libsvmout.write(str(age_group) + ' ')
            for group, prob in prob_hash.iteritems():
                libsvmout.write(str(group) + ':' + str(prob) + ' ')
            libsvmout.write('\n')

        # add to confusion matrix
        if predicted_group != -1:
            conf_matrix[age_group][predicted_group] += 1

    libsvmout.close()


    print "Confusion Matrix:"
    for i in range(len(conf_matrix)):
        for j in range(len(conf_matrix[0])):
            sys.stdout.write(str(conf_matrix[i][j]) + " ")
        print
Example #3
    def getProbArrayFor(self, screen_name):
        try:
            firstname = self.screenNameToFirstName[screen_name]
            probHash = babyname.probHashInGroupForName(firstname)
        
            probArray = []

            for i in range(4):
                probArray.append(probHash[i])
            
            #if (probArray[0] == probArray[1] and
            #    probArray[1] == probArray[2] and
            #    probArray[2] == probArray[3]):
            #    probArray.append(1.0)
            
            return probArray
        
        except KeyError:
            return [0, 0, 0, 0]
Example #4
def procedure2():
    '''
    TESTING THE FIRST NAME BY COMBINING NEIGHBOR NODES (procedure2)

    1. Retrieve central nodes by selecting edges.this and removing duplicates.

    2. Get the age and name info of those nodes by joining with the users table
    and selecting the appropriate columns.

    3. Get the name info of neighbors by joining with the users table and
    filtering on the `this` column for each central node. After this we can
    randomly select up to 20 neighbors to keep the computation manageable.

    4. Run the first-name system on both central nodes and their neighbors, and
    apply several custom aggregate functions:
        a. simple average of the probabilities of every group,
        b. weighted average,
        c. majority vote.
    '''
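    # For illustration only (hypothetical numbers): given the prob_arrays
    #   [[0.1, 0.2, 0.3, 0.4], [0.2, 0.2, 0.2, 0.4], [0.4, 0.3, 0.2, 0.1]],
    # (a) the simple average sums each column ([0.7, 0.7, 0.7, 0.9]) and
    #     predicts the argmax, group 3;
    # (b) the weighted average additionally adds a weighted copy of the
    #     central node's (first) prob_array before taking the argmax; and
    # (c) the majority vote lets every prob_array vote for its own argmax
    #     (3, 3, 0 here) and predicts the most common vote, group 3.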
    # Store pairs of name and age group for central nodes.
    # Most of the complicated operations are handled in db, and we only use
    # the name and the corresponding age info from the result set.
    centralNodes = []

    # Store the names of neighbors for every central node.
    # Key: user_id (of central node), Value: names of up to 20 neighbors
    neighborNames = {}

    con = mdb.connect('localhost', 'yongjoo', 'Fgla4Zp0', 'yongjoo')

    with con:
        cur = con.cursor()

        # The first thing to do is retrieve the central nodes
        cur.execute('''SELECT DISTINCT e.this, u.age, u.name
            FROM edges e
            INNER JOIN users u
            ON e.this = u.user_id
            ''')
        numrows = int(cur.rowcount)
        for i in range(numrows):
            row = cur.fetchone()
            name = row[2]
            age = row[1]
            node_id = row[0]
            ageGroup = ageToAgeGroup(age)
            centralNodes.append([node_id, name, ageGroup])

        print "Finished collecting central nodes."


        def getNeighborsNames(node_id):
            '''Convenience method to retrieve an array of names of neighbors for
            the specific node passed as a parameter. TODO: may add an English
            filter later.'''

            # TODO: may use a random sample instead of limiting. Limiting only
            # selects the first set of id's, which are the smallest id's in the set.
            cur.execute('''SELECT e.this, e.that, u.name, u.age
                FROM edges e
                INNER JOIN users u
                ON e.that = u.user_id
                WHERE this = %s
                LIMIT 80
                ''', (node_id,))

            rowcount = int(cur.rowcount)
            rows = cur.fetchall()
            name_arr = [rows[i][2] for i in range(rowcount)]

            return name_arr
        # end of getNeighborsNames()


        # Secondly, we retrieve the neighbors of every central node and save
        # them into the dict

        # To work around the slow speed of generating the prob_array for all
        # the neighbors, we can choose a mode of operation. The normal mode is
        # 'gen'; once the prob_array_dict has been generated and saved, we can
        # instead load its contents from a file later.
        prob_array_mode = 'gen'
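        # Any other value (e.g. 'load') skips generation and instead reads the
        # previously saved prob_array_dict.json (see the else-branch below).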

        # we skip this step if we will load the probability array from a file.
        if prob_array_mode == 'gen':
            for triplet in centralNodes:
                node = triplet[0]
                neighbor_names = getNeighborsNames(node)
                neighborNames[node] = neighbor_names

            print "Finished collecting neighbor nodes."


    # end of using mysql connection


    # For the sake of prediction and testing, we gather a probability array
    # for every central node. The data structure looks like the following:
    #
    # { node0 : [prob_array, prob_array, ...]
    #   node1 : [prob_array, prob_array, ...]
    #   ... }
    #
    # The `prob_array` is an array of doubles of length 4; an example is
    # [0.2, 0.2, 0.3, 0.3]. The first prob_array in each list holds the
    # probabilities obtained from the central node itself, while the following
    # prob_arrays are obtained from neighbors. We may weight the prob_array
    # from a central node differently from the prob_arrays from neighbors, but
    # we do not differentiate between prob_arrays from neighbors, i.e., they
    # are all assigned equal weight when aggregating.
    prob_array_dict = {}
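    # For example (illustrative values only), an entry might look like:
    #   prob_array_dict[12345] = [
    #       [0.1, 0.4, 0.3, 0.2],      # from the central node's own first name
    #       [0.25, 0.25, 0.25, 0.25],  # from neighbor 1
    #       [0.0, 0.0, 0.0, 0.0],      # neighbor 2: name not in the statistics
    #   ]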


    if prob_array_mode == 'gen':
        for [node_id, name, ageGroup] in centralNodes:
            # Init the slot
            prob_array_dict[node_id] = []

            # We create a list of names where the first element is the central
            # node's own name and the following elements are the neighbors' names.
            all_names = neighborNames[node_id]
            all_names.insert(0, name)
            for name in all_names:
                firstname = name.split(' ')[0]
                prob_hash = probHashInGroupForName(firstname)
                prob_array = [prob_hash[i] for i in range(4)]
                prob_array_dict[node_id].append(prob_array)

        print "Finished getting prob_arrays."

        probfile = 'prob_array_dict.json'
        probout = open(probfile, 'w')
        probout.write(json.dumps(prob_array_dict))
        probout.close()
        print "Wrote the prob_array_dict info to the file " + probfile

    else:
        probfile = 'prob_array_dict.json'
        probin = open(probfile)
        prob_array_dict_str = json.loads(probin.read())
        for key, value in prob_array_dict_str.iteritems():
            prob_array_dict[int(key)] = value
        probin.close()

        print "Read the prob_array_dict info from the file " + probfile


    # Probably the final step is to aggregate the prob_arrays collected for
    # each of the central nodes. We will use the map built-in function to
    # aggregate the probabilities, and we need aggregation functions that
    # work on a list of prob_arrays and return a predicted age group.

    # define more aggregator functions here.
    def average_aggregator(prob_array_list):
        def array_sum(x, y):
            return [x[i] + y[i] for i in range(len(x))]
        summed_array = reduce(array_sum, prob_array_list)
        return (summed_array.index(max(summed_array))
                if sum(summed_array) != 0 else -1)

    def local_aggregator(prob_array_list):
        local = prob_array_list[0]
        return (local.index(max(local))
                if sum(local) != 0 else [-1, local])

    def majority_vote_aggregator(prob_array_list):
        predicted = []
        for prob_array in prob_array_list:
            if sum(prob_array) != 0:
                predicted.append(prob_array.index(max(prob_array)))
        return (max(set(predicted), key = predicted.count)
                if len(predicted) != 0 else -1)

    def weighted_average_aggregator(prob_array_list):
        weight = 2.0
        def array_sum(x, y):
            return [x[i] + y[i] for i in range(len(x))]
        def weighted_array_sum(x, y, weight):
            return [x[i] * weight + y[i] for i in range(len(x))]
        summed_array = reduce(array_sum, prob_array_list)
        summed_array = weighted_array_sum(prob_array_list[0], summed_array, weight)
        return (summed_array.index(max(summed_array))
                if sum(summed_array) != 0 else -1)


    # using factory pattern to return a function that uses a specific
    # aggregator
    def reducer_factory(aggregator):
        def reducer(t):
            '''Key of t is node_id, and the value is the list of prob_array'''
            key = t[0]      # must be a node_id
            value = t[1]    # must be a list of prob_array
            aggregated = aggregator(value)
            return (key, aggregated)
        return reducer

    # predicted_dict has the following structure:
    # { node0: predicted age group,
    #   node1: predicted age group,
    #   ... }
    predicted_dict = dict(map(reducer_factory(average_aggregator),
        prob_array_dict.iteritems()))

    print "Finished generating predicted age groups."


    # Now validate the result against the true values contained in the
    # variable `centralNodes`

    # the number of cases where the db does not hold the first name.
    non_predictable_count = 0

    # confusion matrix
    confusion_mat = []
    for i in range(4):
        confusion_mat.append([0, 0, 0, 0])

    for [node_id, name, ageGroup] in centralNodes:
        firstname = name.split(' ')[0]
        predictGroup = predicted_dict[node_id]

        if predictGroup == -1:
            non_predictable_count += 1
        else:
            confusion_mat[ageGroup][predictGroup] += 1


    # Report the result

    # report the accuracy
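    # `real_accuracy` folds the non-predictable cases back in, crediting each
    # of them with a 0.25 chance of being correct (i.e., a uniform random
    # guess over the 4 age groups).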
    nu = sum([confusion_mat[i][i] for i in range(4)])
    denom = sum([confusion_mat[i][j] for i in range(4) for j in range(4)])
    accuracy = float(nu) / float(denom)
    real_accuracy = (float(nu + non_predictable_count * 0.25) /
            float(denom + non_predictable_count))
    print "Total examples: " + str(len(centralNodes))
    print "Accuracy: " + str(accuracy)
    print "Real accuracy: " + str(real_accuracy)

    # report the confusion matrix
    print "Confusion Matrix:"
    for i in range(4):
        for j in range(4):
            sys.stdout.write(str(confusion_mat[i][j]) + ' ')
        print
Example #5
def run():

    # Set the output location, and open the data file.
    datadir = "/home/pyongjoo/workspace/twitter-research/data/"
    datafile = datadir + "features-Jun19-5n-small.nf"
    out = open(datafile, 'w')


    con = mdb.connect('localhost', 'yongjoo', 'Fgla4Zp0', 'yongjoo')

    with con:
        cur = con.cursor() 

        # Retrieves the central nodes by connecting to MySQL
        # In order to apply the English filter, we perform join with users
        # table. The way to apply the English filter is described in
        # `engFlag.py`.
        centralNodes = []

        cur.execute('''SELECT DISTINCT e.this FROM edges e
                       INNER JOIN users u
                       ON e.this = u.user_id
                       WHERE u.eng = 1''')
        numrows = int(cur.rowcount)

        for i in range(numrows):
            row = cur.fetchone()
            centralNodes.append(row[0])

        # perform random sampling to limit the size of the dataset.
        climit = 500
        if len(centralNodes) > climit:
            centralNodes = random.sample(centralNodes, climit)

        print "Finished collecting central nodes."


        # Based on the central nodes retrieved, we build a dictionary
        # representation of the star-shaped networks.
        # We sample at most `nlimit` neighbors for each central user.
        # The collected dictionary looks like:
        # {
        #   user_id: [...]
        #   user_id: [...]
        # }
        #
        # As above, we also perform join to apply the English filter.
        neighborDic = {}
        nlimit = 5
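        # For example (illustrative ids only), the result might look like
        #   neighborDic = {1001: [2001, 2002, 2003], 1002: []}
        # where an empty list means no English-flagged neighbor was found.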

        for node in centralNodes:
            neighbors = []
            cur.execute('''SELECT e.this, e.that
                         FROM edges e
                         INNER JOIN users u
                         ON e.that = u.user_id
                         WHERE e.this = %s AND
                               u.eng = 1
                         LIMIT %s''',
                    (node, nlimit))
            numrows = int(cur.rowcount)

            for i in range(numrows):
                row = cur.fetchone()
                friend = row[1]
                neighbors.append(friend)

            neighborDic[node] = neighbors

        print "Finished collecting neighbor nodes."


        # We should do two things here.
        #
        # First is to convert the text (tweets) stored in mysql into feature
        # array and print them into the data file. Both of the observed nodes
        # and their friends (effectively hidden nodes) should be written to
        # the file.
        #
        # Second, as we write the feature representation into the file, we
        # also record each user_id together with the index (line order) at
        # which the node is written. This is used later to construct the edges
        # section of the data file: in the edges section we need the converted
        # line numbers instead of the user_ids, which are the original
        # identifiers in `neighborDic`.
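        # Sketch of the resulting data file layout (as written below):
        #   #observed <num central nodes>
        #   <age_group> <label>:<value> <label>:<value> ...
        #   #hidden <num friends>
        #   0 <label>:<value> ...
        #   #edges <num friends>
        #   <central line number> <friend line number>
        #   #prior <num written nodes>
        #   <p0> <p1> <p2> <p3>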
        featureManager = FeatureManager()

        convertedNeighborDic = {}
        datalineNum = 0
        nodeIdToLineNum = {}        # nodeIdToLineNum[node_id] = line number
        nodeIdWritten = []          # record written node_id in order

        # first write central nodes
        out.write("#observed " + str(len(centralNodes)) + "\n")

        for node in centralNodes:
            # register for indexing and locate this node later
            convertedNeighborDic[datalineNum] = []
            nodeIdToLineNum[node] = datalineNum

            # get the text from mysql
            [text, age] = getTextAndAgeWithUserId(node)
            age_group = ageToAgeGroup(age)

            # output class
            out.write(str(age_group))

            # output labels and values
            farr = featureManager.convertTextIntoFeatureArray(text)
            for pair in farr:
                flabel = pair[0]
                fvalue = pair[1]
                out.write(" " + str(flabel) + ":" + str(fvalue))

            out.write('\n')
            datalineNum += 1
            nodeIdWritten.append(node)

        print "Finished writing central nodes."

        # write friends nodes after counting the total number of friends
        numFriends = 0
        for node in centralNodes:
            numFriends += len(neighborDic[node])

        out.write("#hidden " + str(numFriends) + "\n")

        for node in centralNodes:
            for friend in neighborDic[node]:

                # get the text from mysql
                [text, age] = getTextAndAgeWithUserId(friend)

                # output a dummy class
                out.write(str(0))

                # output labels and values
                farr = featureManager.convertTextIntoFeatureArray(text)
                for pair in farr:
                    flabel = pair[0]
                    fvalue = pair[1]
                    out.write(" " + str(flabel) + ":" + str(fvalue))

                out.write('\n')

                # register in the converted neighbor dictionary
                convertedNeighborDic[nodeIdToLineNum[node]].append(datalineNum)

                datalineNum += 1
                nodeIdWritten.append(friend)

        print "Finished writing friends node."

        # Next section is to write edges collected based on the line numbers
        # This is simply printing out the contents of `convertedNeighborDic`
        # in the following fashion:
        #
        #     [this node] [that node]
        #     ...
        #
        # Unlike the dictionary representation, we write the contents in a
        # relational format so the C++ code can process them easily.
        out.write("#edges " + str(numFriends) + "\n")

        for node, friend_arr in convertedNeighborDic.iteritems():
            for friend in friend_arr:
                out.write(str(node) + " " + str(friend) + "\n")

        print "Finished writing edges."

        # From the first name (a guess, but one with high probability of being
        # correct) we can recover a prior probability to some extent. We first
        # retrieve the first name from the database, and use the baby-name
        # statistics to get the probability.
        out.write("#prior " + str(len(nodeIdWritten)) + "\n")

        for node in nodeIdWritten:
            cur.execute("SELECT name FROM users WHERE user_id = %s", (node))
            row = cur.fetchall()[0]
            firstname = row[0].split(' ')[0]
            prob_hash = probHashInGroupForName(firstname)

            for i in range(4):
                out.write(str(prob_hash[i]) + " ")

            out.write("\n")

        print "Finished writing name probability."

        ## end of `with con:` (the database is no longer used from here on)


    # Close the data file to safely store the result.
    out.close()