def reducer2(self, key, value):
        mass = 0.
        num_nodes = 0

        #conn = boto.connect_s3()
        aws_access_key_id = get_jobconf_value('aws_access_key_id')
        aws_secret_access_key = get_jobconf_value('aws_secret_access_key')
        conn = S3Connection(aws_access_key_id, aws_secret_access_key)

        mybucket = conn.get_bucket('ucb-mids-mls-juanjocarin')
        k = Key(mybucket)
        mykey = 'num_mass{}'.format(self.iteration)
        k.key = mykey
        data = k.get_contents_as_string()
        data = data.strip().split('\t')
        num_nodes = num_nodes + int(data[0])
        mass = mass + float(data[1])
        for v in value:
            ## 1) Its adjacency list
            sinks = v[0]
            ## 2) The corrected PR
            ## PR = alpha * PR + alpha * m / |G| + (1-alpha) * (1/|G|)
            PR = self.alpha * (v[1] + mass / num_nodes)
            PR = PR + (1 - self.alpha) / num_nodes
        yield key, [sinks, PR]
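
For reference, a standalone sketch (outside the MRJob) of reading back the same S3 object this reducer fetches. The bucket and key names are taken from the example above; the credentials and the iteration number are placeholders.

from boto.s3.connection import S3Connection
from boto.s3.key import Key

# Placeholder credentials; the job above pulls these from jobconf instead.
conn = S3Connection('YOUR_ACCESS_KEY_ID', 'YOUR_SECRET_ACCESS_KEY')
bucket = conn.get_bucket('ucb-mids-mls-juanjocarin')
k = Key(bucket)
k.key = 'num_mass1'  # assumes iteration 1
num_nodes, dangling_mass = k.get_contents_as_string().strip().split('\t')
print(num_nodes + '\t' + dangling_mass)
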
Example #2
 def mapper_dangling(self, key, value):
     # Topic of Current Node
     topic = get_jobconf_value('topic')
     # Number of Nodes in same Topic as current Node
     n_nodes_topic = self.topicCounts.get(topic, 0)
     
     #sys.stderr.write('[M_D] {0}, {1}, {2} \n'.format(key, topic, n_nodes_topic)) 
     
     i = int(get_jobconf_value('iteration'))
     key = key.replace("\"","")
     key = key.replace("\\","")
     adj_list = ast.literal_eval(str(value))
     
     nodes = int(get_jobconf_value('nodes'))
     teleportation = float(get_jobconf_value('teleportation'))
     topic_bias = float(get_jobconf_value('topic_bias'))
     
     score = adj_list['score']
     
     '''
         Adjust for Topic Bias
         Random Surfer selects Nodes in same Topic as current node using a Topic Bias (> 0.5: Topic Sensitive)
     '''
     if topic != '0':
         random_topic_jump = teleportation * ((topic_bias/n_nodes_topic) + ((1 - topic_bias)/ (nodes - n_nodes_topic)))
         modified_score = random_topic_jump + (1 - teleportation) * ((self.dangling_mass / nodes) + score)
     else:
         modified_score = (teleportation / nodes) + (1 - teleportation) * ((self.dangling_mass / nodes) + score)
     
     #modified_score = (teleportation / nodes) + (1 - teleportation) * ((self.dangling_mass / nodes) + score)
     #modified_score = sum_log((teleportation / nodes), (1 - teleportation)*(self.dangling_mass / nodes))
     #modified_score = sum_log(modified_score, (1 - teleportation)*score)
     adj_list['score'] = modified_score
         
     yield key, adj_list
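
A quick numeric check of the topic-biased formula above, with made-up values for the jobconf parameters; none of these numbers come from the original job.

teleportation, topic_bias = 0.15, 0.99
nodes, n_nodes_topic = 100, 10
dangling_mass, score = 0.02, 0.005

random_topic_jump = teleportation * ((topic_bias / n_nodes_topic) +
                                     ((1 - topic_bias) / (nodes - n_nodes_topic)))
modified_score = random_topic_jump + (1 - teleportation) * ((dangling_mass / nodes) + score)
print(modified_score)  # ~0.0193: most of the teleportation mass stays inside the topic
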
    def mapper_MulMat(self, key, value):
        if 'GModMat' == key:

            tmpRow = row = value[0]
            tmpCol = col = value[1]
            val = value[2]

            GroupID_row, tmp = tmpRow.split("_")
            GroupID_col, tmp = tmpCol.split("_")

            assert GroupID_row == GroupID_col, "GroupID_row and GroupID_col must be the same"

            if row == col and 1 == self.options.iteration:
                maxColumnSum = float(get_jobconf_value("MAXSUM_" + GroupID_row))
                val = val + maxColumnSum

            #yield col, ('A', row, val)   # For A matrix
            #yield row, ('B', col, val)   # For B matrix

            #yield col + "|" + 'A' + "|" + row, val   # For A matrix
            #yield row + "|" + 'B' + "|" + col, val   # For B matrix

            matSize = int(get_jobconf_value("matSize_" + GroupID_row))

            # For A
            for i in range(matSize):
                midKey = get_jobconf_value("matIdx_" + GroupID_row + "_" + str(i))      # matIdx_0_0=0_20   ...

                yield row + "|" + midKey + "|" + col, val   # row, i, col
                yield midKey + "|" + col + "|" + row, val   # i, col, row
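
The mapper above assumes jobconf entries named MAXSUM_<group>, matSize_<group> and matIdx_<group>_<i>. A hypothetical way to supply them is mrjob's JOBCONF class attribute; the concrete values below are illustrative only.

from mrjob.job import MRJob

class MatMulJob(MRJob):  # hypothetical job class
    JOBCONF = {
        'matSize_0': '2',
        'MAXSUM_0': '3.5',
        'matIdx_0_0': '0_10',
        'matIdx_0_1': '0_20',
    }
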
Example #4
 def mapper(self, key, value):
     nodes = int(get_jobconf_value('nodes'))
     i = int(get_jobconf_value('iteration'))
     #sys.stderr.write('[M] {0}, {1} \n'.format(key, value))
     key = key.replace("\"","")
     key = key.replace("\\","")
     adj_list = ast.literal_eval(value)
   
     score = 0
     l = 0
     
     if 'score' in adj_list.keys():
         # Previous Mass/Page Rank
         score = adj_list['score']
         l = len(adj_list) - 1
     else: # First iteration ('score' not yet part of the adjacency list!)
         # Start with uniform probability distribution
         score = 1.0 / nodes
         l = len(adj_list)
         adj_list['score'] = score
         
     if l == 0: # Only 'score' & no out links [Dangling!]
         sys.stderr.write('[{0}][M] "DANGLING MASS" | {1} | {2}\n'.format(i, key, score))
         # Emit using a special key; accumulate in the reducer; distribute in the next MRJob
         yield 'DANGLING', ('SCORE', score)
    
     # Emit the Graph Structure
     yield key, ('GRAPH', adj_list)
                 
     # Emit the new Mass/Page Rank
     for n in adj_list:
         if n != 'score':
             yield n, ('SCORE', score/l)
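
The same mass-splitting logic as the mapper above, condensed into a standalone function and run on a toy two-link node; the node names and the assumed graph size (nodes=4) are made up.

import ast

def split_mass(key, value, nodes=4):
    adj_list = ast.literal_eval(value)
    score = adj_list.get('score', 1.0 / nodes)   # uniform PR on the first pass
    out_degree = len(adj_list) - ('score' in adj_list)
    adj_list['score'] = score
    if out_degree == 0:                          # dangling node
        yield 'DANGLING', ('SCORE', score)
    yield key, ('GRAPH', adj_list)               # preserve the graph structure
    for n in adj_list:
        if n != 'score':
            yield n, ('SCORE', score / out_degree)

print(list(split_mass('A', "{'B': 1, 'C': 1}")))
# [('A', ('GRAPH', {'B': 1, 'C': 1, 'score': 0.25})),
#  ('B', ('SCORE', 0.125)), ('C', ('SCORE', 0.125))]
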
    def mapper(self, key, value):
        nodes = int(get_jobconf_value('nodes'))
        i = int(get_jobconf_value('iteration'))
        #sys.stderr.write('[M] {0}, {1} \n'.format(key, value))
        key = key.replace("\"", "")
        key = key.replace("\\", "")
        adj_list = ast.literal_eval(value)

        score = 0
        l = 0

        if 'score' in adj_list.keys():
            # Previous Mass/Page Rank
            score = adj_list['score']
            l = len(adj_list) - 1
        else:  # First iteration ('score' not yet part of the adjacency list!)
            # Start with uniform probability distribution
            score = 1.0 / nodes
            l = len(adj_list)
            adj_list['score'] = score

        if l == 0:  # Only 'score' & no out links [Dangling!]
            sys.stderr.write('[{0}][M] "DANGLING MASS" | {1} | {2}\n'.format(
                i, key, score))
            # Emit using a special key; accumulate in the reducer; distribute in the next MRJob
            yield 'DANGLING', ('SCORE', score)

        # Emit the Graph Structure
        yield key, ('GRAPH', adj_list)

        # Emit the new Mass/Page Rank
        for n in adj_list:
            if n != 'score':
                yield n, ('SCORE', score / l)
Example #6
    def reducer(self, key, values):     
        i = int(get_jobconf_value('iteration'))
        teleportation = float(get_jobconf_value('teleportation'))
        nodes = int(get_jobconf_value('nodes'))
        
        adj_list = None
        total_score = 0

        for value_type, value in values:
            if value_type == 'GRAPH':
                adj_list = value
            else:
                assert value_type == 'SCORE'
                total_score += value
                #total_score = sum_log(total_score, value)
                
        # Special Key
        if key == 'DANGLING':
            # Write accumulated Dangling Score in a file
            with open('/Users/ssatpati/0-DATASCIENCE/DEV/github/ml/w261/wk9/dangling.txt', 'w') as f:
                f.write('DANGLING\t{0}\n'.format(total_score))
        else:
            #total_score = (teleportation / nodes) + ((1 - teleportation) * total_score)
            #total_score = sum_log((teleportation / nodes), ((1 - teleportation) * total_score))
            if adj_list:
                adj_list['score'] = total_score
            else:
                adj_list = {'score': total_score}
    
            #sys.stderr.write('[R2] {0} | {1} | {2}\n\n'.format(key, total_score, adj_list))
            yield key, adj_list
    def mapper_CalcPij(self, key, value):
        """
        Bij = Aij - ( Ki * Kj ) / M
            = ( ( Aij * M ) - ( Ki * Kj ) ) 
              -----------------------------
                            M
        """
        
        #self.startDate = get_jobconf_value("maxNodeID")
        if key.isdigit():
            """
            Input:
                "1"     "2"
                "2"     "1"
                "3"     "4"
                "3"     "5"
                "4"     "3"
                "4"     "5"
                "5"     "3"
                "5"     "4"
            """
            row = key
            col = value

            Aij = 1.0
            yield row + "_" + col, ('A', Aij)

        elif 'k' == key:
            """
                "k"     ["7", 4]
            """

            row = value[0]
            minNodeID = int(get_jobconf_value("minNodeID"))
            maxNodeID = int(get_jobconf_value("maxNodeID"))
            k_row = float(get_jobconf_value("k" + row))
            M     = float(get_jobconf_value("M"))

            for col in range(minNodeID, maxNodeID + 1):
                if 'local' == RUN_TYPE:
                    #sys.stderr.write("col %d \t |" % col)
                    pass

                k_col = float(get_jobconf_value("k" + str(col)))
                Pij = (k_row * k_col) / M
                yield row + "_" + str(col), ('P', Pij)

        elif 'm' == key:
            return
        elif 'max' == key:
            return
        elif 'min' == key:
            return
        elif 'x' == key:
            return
        else:
            assert False, "In genModularityMat_mapper(), wrong key type. key : %s" % key
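
A toy check of the Pij term emitted above; the degrees and the normaliser M are made-up numbers, not values from the job.

k_row, k_col, M = 3.0, 2.0, 16.0
Pij = (k_row * k_col) / M
print(Pij)  # 0.375; the next step forms Bij = Aij - Pij, per the docstring above
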
 def mapper(self, _, line):
     v = line.split(',')
     n = (len(v)-2)/2  # number of non-zero (column, value) pairs in this row
     i = int(get_jobconf_value("row.num.A")) # we need to know how many rows of A
     j = int(get_jobconf_value("col.num.B")) # we need to know how many columns of B
     
     if v[0]=='0':
         for p in range(n):
             for q in range(j):
                 yield (int(v[1]),q), (int(v[p*2+2]),float(v[p*2+3]))
         
     elif v[0]=='1':
         for p in range(n):
             for q in range(i):
                 yield (q,int(v[p*2+2])), (int(v[1]),float(v[p*2+3]))
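
A hypothetical input line for the mapper above, following the sparse-row encoding its parsing implies (matrix id, row index, then (column, value) pairs); the concrete numbers are made up.

line = '0,1,0,2.0,3,4.5'     # matrix A, row 1, entries A[1,0]=2.0 and A[1,3]=4.5
v = line.split(',')
n = (len(v) - 2) // 2
pairs = [(int(v[p * 2 + 2]), float(v[p * 2 + 3])) for p in range(n)]
print(pairs)                 # [(0, 2.0), (3, 4.5)]
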
    def reducer1_final(self):
        num_nodes = len(self.nodes.keys())
        #conn = boto.connect_s3()

        aws_access_key_id = get_jobconf_value('aws_access_key_id')
        aws_secret_access_key = get_jobconf_value('aws_secret_access_key')
        conn = S3Connection(aws_access_key_id, aws_secret_access_key)
        mybucket = conn.get_bucket('ucb-mids-mls-juanjocarin')
        k = Key(mybucket)
        mykey = 'num_mass{}'.format(self.iteration)
        k.key = mykey
        k.set_contents_from_string(
            str(num_nodes) + '\t' + str(self.dangling_nodes_mass))
        for k, v in self.nodes.iteritems():
            yield k, [v[0], v[1]]
Example #10
 def mapper_init(self):
     self.frontier_node = get_jobconf_value('frontier_node')
     if not self.frontier_node:
         # Save a list of visited nodes
         self.visited = [s.strip() for s in 
                           open('visited.txt').readlines()]
         open('visited.txt', 'w').close()
 def parsemat(self):
     """ Return 1 if this is the A matrix, otherwise return 2"""
     fn = get_jobconf_value('map.input.file')
     if self.options.Amatname in fn: 
         return 1
     else:
         return 2
 def parsemat(self):
     """ Return 1 if this is the A matrix, otherwise return 2"""
     fn = get_jobconf_value('map.input.file')
     if self.options.Amatname in fn:
         return 1
     else:
         return 2
 def mapper(self, _, line):
     # step 0: strip off unexpected characters
     line = line.split('\t')[1]
     
     # step 1: fetch the exodus file from Hadoop cluster
     file = os.path.basename(line)
     if os.path.isfile(os.path.join('./', file)):
         call(['rm', os.path.join('./', file)])
     check_call(['hadoop', 'fs', '-copyToLocal', line, os.path.join('./', file)])
     outdir = os.path.basename(line)
     ind = outdir.rfind('.')
     outdir = outdir[0:ind]
     if os.path.isdir(os.path.join('./', outdir)):
         call(['rm', '-r', os.path.join('./', outdir)])
     call(['mkdir', os.path.join('./', outdir)])
     
     # step 2: do our local processing
     result = convert(os.path.join('./', file), self.timesteps, os.path.join('./', outdir), self.variables)
     
     # step3: write back to Hadoop cluster
     user = get_jobconf_value('mapreduce.job.user.name')
    
     for fname in os.listdir(os.path.join('./', outdir)):
         if call(['hadoop', 'fs', '-test', '-e', os.path.join(self.outdir,outdir,fname)]) == 0:
             call(['hadoop', 'fs', '-rm', os.path.join(self.outdir,outdir,fname)])
         call(['hadoop', 'fs', '-copyFromLocal', os.path.join('./',outdir,fname),os.path.join(self.outdir,outdir,fname)])
         call(['hadoop', 'fs', '-chown', '-R', user, os.path.join(self.outdir)])
     call(['rm', os.path.join('./', file)])
     call(['rm', '-r', os.path.join('./', outdir)])
     
     #step 4: yield output key/value
     if result == True:
         yield (line, 0)
     else:
         yield (line, 1)
    def mapper_CalcPij(self, key, value):
        """
        Bij = Aij - ( Ki * Kj ) / M
            = ( ( Aij * M ) - ( Ki * Kj ) ) 
              -----------------------------
                            M
        """
        
        if key.isdigit():
            """
            Input:
                "1"     "2"
                "2"     "1"
                "3"     "4"
                "3"     "5"
                "4"     "3"
                "4"     "5"
                "5"     "3"
                "5"     "4"
            """
            row = key
            col = value

            Aij = 1.0
            yield row + "_" + col + "_" + "A",  (Aij)

        elif 'k' == key:
            """
                "k"     ["7", 4]
            """

            row = value[0]
            k_row = float(get_jobconf_value("k" + row))

            for col in range(self.options.minNodeID, self.options.maxNodeID + 1):
                if 'local' == RUN_TYPE:
                    #sys.stderr.write("col %d \t |" % col)
                    pass

                k_col = float(get_jobconf_value("k" + str(col)))
                Pij = (k_row * k_col) / self.options.M
                yield row + "_" + str(col) + "_" + "P", (Pij)

        elif key in ['m', 'max', 'min', 'x']:
            return
        else:
            assert False, "In genModularityMat_mapper(), wrong key type. key : %s" % key
    def mapper(self, _, line):
        # step 0: strip off unexpected characters
        line = line.split('\t')[1]

        # step 1: fetch the exodus file from Hadoop cluster
        file = os.path.basename(line)
        if os.path.isfile(os.path.join('./', file)):
            call(['rm', os.path.join('./', file)])
        check_call(
            ['hadoop', 'fs', '-copyToLocal', line,
             os.path.join('./', file)])
        outdir = os.path.basename(line)
        ind = outdir.rfind('.')
        outdir = outdir[0:ind]
        if os.path.isdir(os.path.join('./', outdir)):
            call(['rm', '-r', os.path.join('./', outdir)])
        call(['mkdir', os.path.join('./', outdir)])

        # step 2: do our local processing
        if self.timestepfile is None:
            lines = None
        else:
            f = open(os.path.basename(self.timestepfile))
            lines = f.readlines()
            for i in xrange(0, len(lines)):
                lines[i] = float(lines[i].strip())

        result = convert(os.path.join('./', file), self.timesteps,
                         os.path.join('./', outdir), self.variables, lines)

        # step3: write back to Hadoop cluster
        user = get_jobconf_value('mapreduce.job.user.name')

        for fname in os.listdir(os.path.join('./', outdir)):
            if call([
                    'hadoop', 'fs', '-test', '-e',
                    os.path.join(self.outdir, outdir, fname)
            ]) == 0:
                call([
                    'hadoop', 'fs', '-rm',
                    os.path.join(self.outdir, outdir, fname)
                ])
            call([
                'hadoop', 'fs', '-copyFromLocal',
                os.path.join('./', outdir, fname),
                os.path.join(self.outdir, outdir, fname)
            ])
            call([
                'hadoop', 'fs', '-chown', '-R', user,
                os.path.join(self.outdir)
            ])
        call(['rm', os.path.join('./', file)])
        call(['rm', '-r', os.path.join('./', outdir)])

        #step 4: yield output key/value
        if result == True:
            yield (line, 0)
        else:
            yield (line, 1)
Example #16
 def mapper_dangling(self, key, value):
     #sys.stderr.write('[M_D] {0}, {1} \n'.format(key, value))
     i = int(get_jobconf_value('iteration'))
     key = key.replace("\"","")
     key = key.replace("\\","")
     adj_list = ast.literal_eval(str(value))
     
     if self.dangling_mass > 0:
         nodes = int(get_jobconf_value('nodes'))
         teleportation = float(get_jobconf_value('teleportation'))
         score = adj_list['score']
         modified_score = (teleportation / nodes) + (1 - teleportation) * ((self.dangling_mass / nodes) + score)
         #modified_score = sum_log((teleportation / nodes), (1 - teleportation)*(self.dangling_mass / nodes))
         #modified_score = sum_log(modified_score, (1 - teleportation)*score)
         adj_list['score'] = modified_score
         
     yield key, adj_list
 def mapper(self, key, value):
     nodes = int(get_jobconf_value('nodes'))
     dangling_mass = float(get_jobconf_value('dangling_mass'))
     teleportation = float(get_jobconf_value('teleportation'))
     #sys.stderr.write('[M] {0}, {1} \n'.format(key, value))
     key = key.replace("\"","")
     key = key.replace("\\","")
     neighbors = ast.literal_eval(value)
     
     score = float(neighbors['score'])
     
     modified_score = teleportation / nodes + (1 - teleportation) * ( (dangling_mass / nodes) + score)
     
     sys.stderr.write('{0}, {1}, {2}\n'.format(score, modified_score, dangling_mass))
     
     neighbors['score'] = modified_score
     
     yield key, neighbors
    def mapper_dangling(self, key, value):
        #sys.stderr.write('[M_D] {0}, {1} \n'.format(key, value))
        i = int(get_jobconf_value('iteration'))
        key = key.replace("\"", "")
        key = key.replace("\\", "")
        adj_list = ast.literal_eval(str(value))

        if self.dangling_mass > 0:
            nodes = int(get_jobconf_value('nodes'))
            teleportation = float(get_jobconf_value('teleportation'))
            score = adj_list['score']
            modified_score = (teleportation / nodes) + (1 - teleportation) * (
                (self.dangling_mass / nodes) + score)
            #modified_score = sum_log((teleportation / nodes), (1 - teleportation)*(self.dangling_mass / nodes))
            #modified_score = sum_log(modified_score, (1 - teleportation)*score)
            adj_list['score'] = modified_score

        yield key, adj_list
	def mapper(self, _, l):
		t = l.strip('\n').split('\t')
		text = t[1]
		i = int(t[0])
		n = int(get_jobconf_value("total"))
		for j in range(1, i):
			yield(("%d,%d" % (i, j)), text)
		for j in range(i + 1, n + 1):
			yield(("%d,%d" % (j, i)), text)
 def mapper_dangling_init(self):
     i = int(get_jobconf_value('iteration'))
     aws_access_key_id = get_jobconf_value('aws_access_key_id')
     aws_secret_access_key = get_jobconf_value('aws_secret_access_key')
     
     self.dangling_mass = 0
     
     # Read Dangling Mass from S3 Bucket
     try:
         conn = boto.connect_s3()
         bucket = conn.get_bucket('ucb-mids-mls-juanjocarin')
         k = Key(bucket)
         k.key = 'hw93/dangling_mass/{0}'.format(i) # Same as iteration
         self.dangling_mass = float(k.get_contents_as_string())
     except boto.exception.S3ResponseError as err:
         sys.stderr.write(str(err))
         sys.exit(1)
     
     sys.stderr.write('[{0}][M_D] DANGLING MASS: {1}\n'.format(i, self.dangling_mass))
    def mapper(self, _, line):
        v = line.split(',')
        n = (len(v) - 2) / 2  # number of non-zero (column, value) pairs in this row
        i = int(get_jobconf_value(
            "row.num.A"))  # we need to know how many rows of A
        j = int(get_jobconf_value(
            "col.num.B"))  # we need to know how many columns of B

        if v[0] == '0':
            for p in range(n):
                for q in range(j):
                    yield (int(v[1]), q), (int(v[p * 2 + 2]),
                                           float(v[p * 2 + 3]))

        elif v[0] == '1':
            for p in range(n):
                for q in range(i):
                    yield (q, int(v[p * 2 + 2])), (int(v[1]),
                                                   float(v[p * 2 + 3]))
    def mapper_dangling_init(self):
        i = int(get_jobconf_value('iteration'))
        aws_access_key_id = get_jobconf_value('aws_access_key_id')
        aws_secret_access_key = get_jobconf_value('aws_secret_access_key')

        self.dangling_mass = 0

        # Read Dangling Mass from S3 Bucket
        try:
            conn = boto.connect_s3()
            bucket = conn.get_bucket('ucb-mids-mls-juanjocarin')
            k = Key(bucket)
            k.key = 'hw93/dangling_mass/{0}'.format(i)  # Same as iteration
            self.dangling_mass = float(k.get_contents_as_string())
        except boto.exception.S3ResponseError as err:
            sys.stderr.write(str(err))
            sys.exit(1)

        sys.stderr.write('[{0}][M_D] DANGLING MASS: {1}\n'.format(
            i, self.dangling_mass))
    def reducer(self, key, values):
        i = int(get_jobconf_value('iteration'))
        teleportation = float(get_jobconf_value('teleportation'))
        nodes = int(get_jobconf_value('nodes'))
        aws_access_key_id = get_jobconf_value('aws_access_key_id')
        aws_secret_access_key = get_jobconf_value('aws_secret_access_key')

        adj_list = None
        total_score = 0

        for value_type, value in values:
            if value_type == 'GRAPH':
                adj_list = value
            else:
                assert value_type == 'SCORE'
                total_score += value
                #total_score = sum_log(total_score, value)

        # Write Special Key to S3
        if key == 'DANGLING':
            # Write accumulated Dangling Score in a S3 Key
            try:
                conn = boto.connect_s3()
                bucket = conn.get_bucket('ucb-mids-mls-juanjocarin')
                k = Key(bucket)
                k.key = 'hw93/dangling_mass/{0}'.format(i)  # Same as iteration
                k.set_contents_from_string(str(total_score))
            except boto.exception.S3ResponseError as err:
                sys.stderr.write(str(err))
                sys.exit(1)
        else:
            #total_score = (teleportation / nodes) + ((1 - teleportation) * total_score)
            #total_score = sum_log((teleportation / nodes), ((1 - teleportation) * total_score))
            if adj_list:
                adj_list['score'] = total_score
            else:
                adj_list = {'score': total_score}

            #sys.stderr.write('[R2] {0} | {1} | {2}\n\n'.format(key, total_score, adj_list))
            yield key, adj_list
    def reducer(self, key, values):     
        i = int(get_jobconf_value('iteration'))
        teleportation = float(get_jobconf_value('teleportation'))
        nodes = int(get_jobconf_value('nodes'))
        aws_access_key_id = get_jobconf_value('aws_access_key_id')
        aws_secret_access_key = get_jobconf_value('aws_secret_access_key')
        
        adj_list = None
        total_score = 0

        for value_type, value in values:
            if value_type == 'GRAPH':
                adj_list = value
            else:
                assert value_type == 'SCORE'
                total_score += value
                #total_score = sum_log(total_score, value)
                
        # Write Special Key to S3
        if key == 'DANGLING':
            # Write accumulated Dangling Score in a S3 Key
            try:
                conn = boto.connect_s3()
                bucket = conn.get_bucket('ucb-mids-mls-juanjocarin')
                k = Key(bucket)
                k.key = 'hw93/dangling_mass/{0}'.format(i) # Same as iteration
                k.set_contents_from_string(str(total_score))
            except boto.exception.S3ResponseError as err:
                sys.stderr.write(str(err))
                sys.exit(1)
        else:
            #total_score = (teleportation / nodes) + ((1 - teleportation) * total_score)
            #total_score = sum_log((teleportation / nodes), ((1 - teleportation) * total_score))
            if adj_list:
                adj_list['score'] = total_score
            else:
                adj_list = {'score': total_score}
    
            #sys.stderr.write('[R2] {0} | {1} | {2}\n\n'.format(key, total_score, adj_list))
            yield key, adj_list
Example #25
 def mapper_dangling_init(self):
     i = int(get_jobconf_value('iteration'))
     self.dangling_mass = 0
     f_dangling = '/Users/ssatpati/0-DATASCIENCE/DEV/github/ml/w261/wk9/dangling.txt'
     try:
         with open(f_dangling, 'r') as f:
             l = f.readlines()
             if l:
                 self.dangling_mass = float(l[0].split('\t')[1])
         open(f_dangling, 'w').close()
     except Exception as e:
         pass
     sys.stderr.write('[{0}][M_D] DANGLING MASS: {1}\n'.format(i, self.dangling_mass))
    def mapper(self, _, line):
        dist_type = get_jobconf_value('dist_type')
        tokens = line.strip().split('\t')

        key = tokens[0].replace("\"", "")
        dict_pairs = ast.literal_eval(tokens[1])

        for n_key, n_dict_pairs in self.stripes.iteritems():

            if key > n_key:
                continue

            self.counter += 1
            if self.counter % 1000 == 0:
                self.set_status('# of Distances Calculated: {0}'.format(
                    self.counter))

            distance = None

            if dist_type == 'euclid':

                # Calculate Euclidean Distance
                squared_distance = 0
                for k in n_dict_pairs.keys():
                    squared_distance += (dict_pairs.get(k, 0) -
                                         n_dict_pairs.get(k, 0))**2

                distance = math.sqrt(squared_distance)

            if dist_type == 'cosine':

                # Calculate cosine similarity (note: a similarity, not a distance)
                # Iterate over the full key space of the stripes
                norm_x = 0
                norm_y = 0
                dot_x_y = 0
                for k in self.stripes.keys(
                ):  # Iterate through entire key range once
                    norm_x += dict_pairs.get(k, 0) * dict_pairs.get(k, 0)
                    norm_y += n_dict_pairs.get(k, 0) * n_dict_pairs.get(k, 0)
                    dot_x_y += dict_pairs.get(k, 0) * n_dict_pairs.get(k, 0)

                distance = float(dot_x_y) / (math.sqrt(norm_x) *
                                             math.sqrt(norm_y))

            self.increment_counter('distance',
                                   'num_{0}_distances'.format(dist_type),
                                   amount=1)
            yield (distance), (key, n_key)
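
A quick standalone check of the cosine computation above with two made-up stripes; note that the value it produces is a cosine similarity (1 = identical direction), which the job then yields under the name distance.

import math

x = {'a': 1.0, 'b': 2.0}
y = {'a': 2.0, 'c': 1.0}
keys = set(x) | set(y)
dot = sum(x.get(k, 0) * y.get(k, 0) for k in keys)
sim = dot / (math.sqrt(sum(v * v for v in x.values())) *
             math.sqrt(sum(v * v for v in y.values())))
print(sim)  # 0.4
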
Example #27
 def reducer(self, idx, inputdata): 
     centroids = []
     k = int(get_jobconf_value('k'))
     num = [0] * k
     for i in range(k):
         centroids.append([0 for i in xrange(1000)])
     for d, n in inputdata:
         num[idx] = num[idx] + n
         for i in xrange(1000):
             centroids[idx][i] = centroids[idx][i] + d[i]
     for i in xrange(1000):
         centroids[idx][i] = centroids[idx][i]/num[idx]
    
     with open('Centroids.txt', 'a') as f:
         f.writelines(",".join(str(i) for i in centroids[idx]) + '\n')
     yield idx,(centroids[idx], num)
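
A toy version of the centroid update above in 3 dimensions instead of 1000 (the points are made up): sum the coordinates of the points assigned to the cluster, then divide by the count.

points = [[1.0, 2.0, 3.0], [3.0, 2.0, 1.0]]
count = len(points)
centroid = [coord_sum / count for coord_sum in map(sum, zip(*points))]
print(centroid)  # [2.0, 2.0, 2.0]
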
Example #28
    def mapper(self, _, line):
        dist_type = get_jobconf_value('dist_type')
        tokens = line.strip().split('\t')
        
        key = tokens[0].replace("\"","")
        dict_pairs = ast.literal_eval(tokens[1])
        
        for n_key, n_dict_pairs in self.stripes.iteritems():
            # TODO distance calc for only (a,b) but not (b,a) --> Redundant
            if key > n_key:
                continue
            
            self.counter += 1   
            if self.counter % 1000 == 0:
                self.set_status('# of Distances Calculated: {0}'.format(self.counter))
                
            distance = None
            
            if dist_type == 'euclid':

                # Calculate Euclidean Distance
                squared_distance = 0
                for k in n_dict_pairs.keys():
                    squared_distance += (dict_pairs.get(k, 0) - n_dict_pairs.get(k, 0)) ** 2
                    
                distance = math.sqrt(squared_distance)
                
            if dist_type == 'cosine':
                
                # Calculate cosine similarity (note: a similarity, not a distance)
                # Iterate over the full key space of the stripes
                norm_x = 0
                norm_y = 0
                dot_x_y = 0
                for k in self.stripes.keys(): # Iterate through entire key range once
                    norm_x += dict_pairs.get(k,0) * dict_pairs.get(k,0)
                    norm_y += n_dict_pairs.get(k,0) * n_dict_pairs.get(k,0)
                    dot_x_y += dict_pairs.get(k,0) * n_dict_pairs.get(k,0)
                    
                distance = float(dot_x_y) / (math.sqrt(norm_x) * math.sqrt(norm_y))
          
            self.increment_counter('distance', 'num_{0}_distances'.format(dist_type), amount=1)
            yield (distance), (key, n_key)
Example #29
 def mapper_dangling_init(self):
     i = int(get_jobconf_value('iteration'))
     
     # Page/Topic Mapping & Topic Counts for each Topic.
     self.topics = {}
     self.topicCounts = {}
     with open('randNet_topics.txt') as f:
         for l in f:
             t = l.split('\t')
             self.topics[t[0].strip()] = t[1].strip()
             
     for k,v in self.topics.iteritems():
         self.topicCounts[v] = self.topicCounts.get(v, 0) + 1
     
     self.dangling_mass = 0
     f_dangling = '/Users/ssatpati/0-DATASCIENCE/DEV/github/ml/w261/wk9/dangling.txt'
     try:
         with open(f_dangling, 'r') as f:
             l = f.readlines()
             if l:
                 self.dangling_mass = float(l[0].split('\t')[1])
         open(f_dangling, 'w').close()
     except Exception as e:
         pass
class PageRank_AWS(MRJob):

    iteration = get_jobconf_value('iteration')

    #OUTPUT_PROTOCOL = RawValueProtocol

    alpha = 0.85

    def steps(self):
        return [
            MRStep(mapper_init=self.mapper1_init,
                   mapper=self.mapper1,
                   mapper_final=self.mapper1_final,
                   reducer_init=self.reducer1_init,
                   reducer=self.reducer1,
                   reducer_final=self.reducer1_final),
            MRStep(mapper=self.mapper2, reducer=self.reducer2)
        ]

    def mapper1_init(self):
        ## Associative array for in-mapper combining
        self.nodes = {}
        self.nodes_prev = {}
        ## Keep track of nodes in the graph
        self.list_nodes = []
        self.list_sources = []

    def mapper1(self, _, line):
        ## Format of each line:
        ## n1 TAB {n2: 1, n3:1, ...} TAB PR(n1)
        ## The 3rd field is not present in the 1st iteration
        ## which only contains each node and its adjacency list
        #line = line.strip().split('\t')
        ## 1st field is a node that acts as source
        ## (Though from 2nd iteration on all nodes will be present,
        ## some with an empty list of outgoing links)
        #source = line[0]
        ## 2nd field is a dictionary of links with their weights
        ## (Set to 1 from 2nd iteration on; they're not relevant anyway)
        #sink = ast.literal_eval(line[1])
        ## Keep only the sinks, not the weights
        #sinks = sink.keys()
        ## Include those sinks in a list of nodes

        line = line.strip().split('\t')
        source = line[0].strip('"')
        if isinstance(ast.literal_eval(line[1]), dict):
            sinks = ast.literal_eval(line[1]).keys()
            if len(line) < 3:
                PR = 1e-3
                for sink in sinks:
                    self.nodes_prev[sink] = PR
            else:
                PR = float(line[2])
                self.nodes_prev[source] = PR
        else:
            sinks = ast.literal_eval(line[1])[0].keys()
            PR = float(ast.literal_eval(line[1])[1])
            self.nodes_prev[source] = PR

        for sink in sinks:
            if sink not in self.list_nodes:
                self.list_nodes.append(sink)

        ## Pass the graph structure (the adjacency list)
        yield source, [sinks]

        ## Include the source in the list of nodes, too
        if source not in self.list_nodes:
            self.list_nodes.append(source)
        # And also in a list of sources... if it really has outgoing links!
        if source not in self.list_sources and len(sinks) != 0:
            self.list_sources.append(source)
        ## If PR of the source is not present (1st iteration)
        #if len(line) < 3:
        ## All nodes have an initial PR of 1e-3
        ## The value can be any (not necessarily 1/|G|)
        ## (We don't know the value of |G| yet!!!)
        ## Just takes more or less to converge
        ## and the sum of PRs in the first iterations
        ## will be less than 100%
        #    PR = 1e-3
        ## Keep track of the previous PR to distribute the PR mass
        ## of the dangling nodes
        ## (They are never sources, but have to be sinks)
        #    for sink in sinks:
        #        self.nodes_prev[sink] = PR
        #else:
        ## From 2nd iteration on, we already know the PR
        #    PR = float(line[2])
        #    self.nodes_prev[source] = PR

        ## Distribute the mass of the source along its sinks
        ## We put the value in the associative array
        ## and emit it in the in-mapper combiner
        for node in sinks:
            self.nodes.setdefault(node, 0.)
            self.nodes[node] = self.nodes[node] + PR / len(sinks)

    def mapper1_final(self):
        ## For all nodes detected
        for node in self.list_nodes:
            ## If they have ingoing links, emit their PR
            ## as well as the total number of nodes (|G|)
            if node in self.nodes.keys():
                yield node, [self.nodes[node], len(self.list_nodes), 0.]
            ## If not a source (i.e., a DANGLING NODE), emit its previous PR
            ## to be distributed; otherwise, 0
            if node in self.nodes_prev.keys():
                yield node, [0., len(self.list_nodes), self.nodes_prev[node]]
            ## If not (they are sources but not sinks), their PR will be 0
            ## (before considering dangling nodes & teleportation)
            else:
                yield node, [0., len(self.list_nodes), 0.]

        ## 1st mapper emits each node as key, and 3 values per node
        ## Current PR (w/o considering dangling nodes and teleportation yet)
        ## |G|: number of nodes in the graph
        ## Previous PR if node is a dangling one, 0 otherwise
        ## Also (part of) the structure of the graph (adjacency lists)

    def reducer1_init(self):
        ## Keep track of nodes in the graph
        self.nodes = {}
        self.dangling_nodes_mass = 0

    def reducer1(self, key, value):
        ## Variables to keep track / aggregate PRs, number of nodes, sinks, etc.
        PR = 0.
        num_nodes = 0
        sinks = {}
        outlinks = []

        if key not in self.nodes.keys():
            self.nodes.setdefault(key, [])

        prev_mass = 0.
        for v in value:
            node_type = 'sink'
            ## When the value is the graph structure (outlinks of a node)
            if isinstance(v[0], list):
                outlinks = v[0]
                if len(outlinks) != 0:
                    node_type = 'source'
            ## When the value is the mass passed by a neighbor linking to the
            ## node (as well as the number of nodes and previous PR in
            ## case of a dangling node)
            else:
                PR = PR + v[0]
                # num_nodes = v[1]
                prev_mass = prev_mass + v[2]
        if node_type == 'sink':
            self.dangling_nodes_mass = self.dangling_nodes_mass + prev_mass

        ## Add weights to the adjacency list to be consistent with the original
        ## file structure
        for node in outlinks:
            sinks[node] = 1

        ## The 1st job emits each node as key, and 4 values
        ## its adjacency list
        ## its current PageRank
        ## the total number of nodes found in the graph
        ## the previous PR in case of a dangling node
        self.nodes[key] = [sinks, PR, num_nodes]

    def reducer1_final(self):
        num_nodes = len(self.nodes.keys())
        #conn = boto.connect_s3()

        aws_access_key_id = get_jobconf_value('aws_access_key_id')
        aws_secret_access_key = get_jobconf_value('aws_secret_access_key')
        conn = S3Connection(aws_access_key_id, aws_secret_access_key)
        mybucket = conn.get_bucket('ucb-mids-mls-juanjocarin')
        k = Key(mybucket)
        mykey = 'num_mass{}'.format(self.iteration)
        k.key = mykey
        k.set_contents_from_string(
            str(num_nodes) + '\t' + str(self.dangling_nodes_mass))
        for k, v in self.nodes.iteritems():
            yield k, [v[0], v[1]]

    #############
    ## 2nd JOB ##
    #############

    def mapper2_init(self):
        ## Associative array for in-mapper combining
        self.nodes = {}
        ## Variable to keep track of the "lost" mass
        #self.dangling_nodes_mass = 0

    def mapper2(self, key, value):
        ## Aggregate (previous!) PR mass of the dangling nodes
        #self.dangling_nodes_mass = sum_log(self.dangling_nodes_mass, value[3])
        ## Don't need the 4th value anymore
        #self.nodes[key] = value[:3]
        yield key, value

    def mapper2_final(self):
        ## Emit the associative array
        for node in self.nodes.keys():
            value = self.nodes[node]
            ## Include the PR mass of dangling nodes again
            ## But now it's the total mass, and included in every node
            #value.append(self.dangling_nodes_mass)
            #yield node, value

    def reducer2(self, key, value):
        mass = 0.
        num_nodes = 0

        #conn = boto.connect_s3()
        aws_access_key_id = get_jobconf_value('aws_access_key_id')
        aws_secret_access_key = get_jobconf_value('aws_secret_access_key')
        conn = S3Connection(aws_access_key_id, aws_secret_access_key)

        mybucket = conn.get_bucket('ucb-mids-mls-juanjocarin')
        k = Key(mybucket)
        mykey = 'num_mass{}'.format(self.iteration)
        k.key = mykey
        data = k.get_contents_as_string()
        data = data.strip().split('\t')
        num_nodes = num_nodes + int(data[0])
        mass = mass + float(data[1])
        for v in value:
            ## 1) Its adjacency list
            sinks = v[0]
            ## 2) The corrected PR
            ## PR = alpha * PR + alpha * m / |G| + (1-alpha) * (1/|G|)
            PR = self.alpha * (v[1] + mass / num_nodes)
            PR = PR + (1 - self.alpha) / num_nodes
        yield key, [sinks, PR]
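
A hypothetical driver for the class above, showing one way the jobconf values it reads (iteration and the AWS credentials) could be supplied at launch via mrjob's --jobconf flag; the module name, input file and credential strings are placeholders.

from pagerank_aws import PageRank_AWS   # assumed module name

if __name__ == '__main__':
    job = PageRank_AWS(args=[
        'graph_input.txt',
        '--jobconf', 'iteration=1',
        '--jobconf', 'aws_access_key_id=YOUR_ACCESS_KEY_ID',
        '--jobconf', 'aws_secret_access_key=YOUR_SECRET_ACCESS_KEY',
    ])
    with job.make_runner() as runner:
        runner.run()
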
Example #31
 def mapper_init(self):
     self.cui_idx = int(get_jobconf_value("cui_idx"))
Example #32
 def mapper(self, _, line):
     for word in line.split():
         yield word, get_jobconf_value('map.input.file')
 def mapper_init(self):
   self.cui_idx = int(get_jobconf_value("cui_idx"))
Example #34
 def mapper_init(self):
     self.start_node = get_jobconf_value('start_node')
     self.stop_node = get_jobconf_value('stop_node')
     sys.stderr.write('### Start/Frontier Node: {0}\n'.format(self.start_node))
     sys.stderr.write('### Stop: {0}\n'.format(self.stop_node))
Example #35
 def test_get_jobconf_value_2(self):
     os.environ['mapreduce_job_user_name'] = 'Edsger W. Dijkstra'
     self.assertEqual(get_jobconf_value('user.name'),
                      'Edsger W. Dijkstra')
     self.assertEqual(get_jobconf_value('mapreduce.job.user.name'),
                      'Edsger W. Dijkstra')
Example #36
 def test_get_jobconf_value_1(self):
     os.environ['user_name'] = 'Edsger W. Dijkstra'
     assert_equal(get_jobconf_value('user.name'),
                  'Edsger W. Dijkstra')
     assert_equal(get_jobconf_value('mapreduce.job.user.name'),
                  'Edsger W. Dijkstra')
Example #37
 def test_get_jobconf_value_2(self):
     os.environ['mapreduce_job_user_name'] = 'Edsger W. Dijkstra'
     self.assertEqual(get_jobconf_value('user.name'), 'Edsger W. Dijkstra')
     self.assertEqual(get_jobconf_value('mapreduce.job.user.name'),
                      'Edsger W. Dijkstra')
Example #38
    def reducer(self, key, values):
        """
        input: -1, (index, valarray)
        output: global variance exodus file
        """
        
        val_order = {}
        
        for i, value in enumerate(values):
            val_order[value[0]] = value[1]

        val = []
        for k, value in sorted(val_order.iteritems()):
            val.extend(value)
        val2 = np.array(val)
        
        # grab template exodus file from HDFS
        
        tmpstr = self.indir[7:]
        index = tmpstr.find('/')
        prefix = 'hdfs://'+tmpstr[0:index]
        
        cmd = 'hadoop fs -ls '+ self.indir
        p = Popen(cmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True)
        content = p.stdout.read()
        files = content.split('\n')
        
        flag = True

        for file in files:
            file = file.split(' ')
            fname = file[len(file)-1]
            if fname.endswith('.e'):
                fname = prefix + fname
                if flag:
                    check_call(['hadoop', 'fs', '-copyToLocal', fname, 'template.e'])
                    flag = False
                    break
        
        template = 'template.e'
        
        # create new interpolation exodus file
        
        if call(['test', '-e', template]) != 0:
            print >>sys.stderr, "The template file does not exist!"
            yield key,1
        else: 
        
            print >>sys.stderr, "Reading template file %s" % (template)
            templatefile = ep.ExoFile(template,'r')
            
            outfile = self.outputname+'.e'
            print >>sys.stderr, "Writing outputfile %s"%(os.path.join(outfile))
            newfile = ep.ExoFile(os.path.join(outfile),'w')  
            
            time_steps = np.array([0.0])
            templatefile.change_nodal_vars2(newfile, time_steps, [self.variable], [val2], ['d'])

            newfile.src.sync()
            newfile.close()

            print >>sys.stderr, "Finished writing data, copying to Hadoop"
            
            user = get_jobconf_value('mapreduce.job.user.name')
            call(['hadoop', 'fs', '-copyFromLocal', outfile, os.path.join(self.outdir,outfile)])
            call(['hadoop', 'fs', '-chown', '-R', user, os.path.join(self.outdir)])
            
            print >>sys.stderr, "Copied to Hadoop, removing ..."
            
            call(['rm', template])
            call(['rm', outfile])
            yield key,0
            
            print >>sys.stderr, "Done"
Example #39
 def mapper(self, _, line):
     for jobconf in JOBCONF_LIST:
         yield (jobconf, get_jobconf_value(jobconf))
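
JOBCONF_LIST is defined elsewhere in that job; an illustrative (assumed) definition using standard Hadoop property names would be:

JOBCONF_LIST = [
    'mapreduce.job.id',
    'mapreduce.task.id',
    'mapreduce.map.input.file',
]
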
Example #40
 def mapper(self, _, line):
     for word in WORD_RE.findall(line):
         yield (get_jobconf_value("mapreduce.map.input.file"), 1)
Example #41
 def test_get_jobconf_value_1(self):
     os.environ['user_name'] = 'Edsger W. Dijkstra'
     assert_equal(get_jobconf_value('user.name'), 'Edsger W. Dijkstra')
     assert_equal(get_jobconf_value('mapreduce.job.user.name'),
                  'Edsger W. Dijkstra')