def reducer2(self, key, value):
    mass = 0.
    num_nodes = 0
    #conn = boto.connect_s3()
    aws_access_key_id = get_jobconf_value('aws_access_key_id')
    aws_secret_access_key = get_jobconf_value('aws_secret_access_key')
    conn = S3Connection(aws_access_key_id, aws_secret_access_key)
    mybucket = conn.get_bucket('ucb-mids-mls-juanjocarin')
    k = Key(mybucket)
    mykey = 'num_mass{}'.format(self.iteration)
    k.key = mykey
    data = k.get_contents_as_string()
    data = data.strip().split('\t')
    num_nodes = num_nodes + int(data[0])
    mass = mass + float(data[1])
    for v in value:
        ## 1) Its adjacency list
        sinks = v[0]
        ## 2) The corrected PR
        ## PR = alpha * PR + alpha * m / |G| + (1-alpha) * (1/|G|)
        PR = self.alpha * (v[1] + mass / num_nodes)
        PR = PR + (1 - self.alpha) / num_nodes
        yield key, [sinks, PR]
def mapper_dangling(self, key, value):
    # Topic of Current Node
    topic = get_jobconf_value('topic')
    # Number of Nodes in same Topic as current Node
    n_nodes_topic = self.topicCounts.get(topic, 0)
    #sys.stderr.write('[M_D] {0}, {1}, {2} \n'.format(key, topic, n_nodes_topic))
    i = int(get_jobconf_value('iteration'))
    key = key.replace("\"", "")
    key = key.replace("\\", "")
    adj_list = ast.literal_eval(str(value))
    nodes = int(get_jobconf_value('nodes'))
    teleportation = float(get_jobconf_value('teleportation'))
    topic_bias = float(get_jobconf_value('topic_bias'))
    score = adj_list['score']
    '''
    Adjust for Topic Bias
    Random Surfer selects Nodes in same Topic as current node
    using a Topic Bias (> 0.5: Topic Sensitive)
    '''
    if topic != '0':
        random_topic_jump = teleportation * ((topic_bias / n_nodes_topic) +
                                             ((1 - topic_bias) / (nodes - n_nodes_topic)))
        modified_score = random_topic_jump + (1 - teleportation) * ((self.dangling_mass / nodes) + score)
    else:
        modified_score = (teleportation / nodes) + (1 - teleportation) * ((self.dangling_mass / nodes) + score)
    #modified_score = (teleportation / nodes) + (1 - teleportation) * ((self.dangling_mass / nodes) + score)
    #modified_score = sum_log((teleportation / nodes), (1 - teleportation)*(self.dangling_mass / nodes))
    #modified_score = sum_log(modified_score, (1 - teleportation)*score)
    adj_list['score'] = modified_score
    yield key, adj_list
def mapper_MulMat(self, key, value):
    if 'GModMat' == key:
        tmpRow = row = value[0]
        tmpCol = col = value[1]
        val = value[2]
        GroupID_row, tmp = tmpRow.split("_")
        GroupID_col, tmp = tmpCol.split("_")
        assert GroupID_row == GroupID_col, "GroupID_row and GroupID_col must be same"
        if row == col and 1 == self.options.iteration:
            maxColumnSum = float(get_jobconf_value("MAXSUM_" + GroupID_row))
            val = val + maxColumnSum
        #yield col, ('A', row, val)                 # For A matrix
        #yield row, ('B', col, val)                 # For B matrix
        #yield col + "|" + 'A' + "|" + row, val     # For A matrix
        #yield row + "|" + 'B' + "|" + col, val     # For B matrix
        matSize = int(get_jobconf_value("matSize_" + GroupID_row))
        # For A
        for i in range(matSize):
            midKey = get_jobconf_value("matIdx_" + GroupID_row + "_" + str(i))  # matIdx_0_0=0_20 ...
            yield row + "|" + midKey + "|" + col, val    # row, i, col
            yield midKey + "|" + col + "|" + row, val    # i, col, row
def mapper(self, key, value):
    nodes = int(get_jobconf_value('nodes'))
    i = int(get_jobconf_value('iteration'))
    #sys.stderr.write('[M] {0}, {1} \n'.format(key, value))
    key = key.replace("\"", "")
    key = key.replace("\\", "")
    adj_list = ast.literal_eval(value)
    score = 0
    l = 0
    if 'score' in adj_list.keys():
        # Previous Mass/Page Rank
        score = adj_list['score']
        l = len(adj_list) - 1
    else:
        # First iteration ('score' not yet part of the adjacency list!)
        # Start with uniform probability distribution
        score = 1.0 / nodes
        l = len(adj_list)
        adj_list['score'] = score
    if l == 0:
        # Only 'score' & no out links [Dangling!]
        sys.stderr.write('[{0}][M] "DANGLING MASS" | {1} | {2}\n'.format(i, key, score))
        # Emit using a special key; Accumulate in Reducer; Distribute in the next MRJob
        yield 'DANGLING', ('SCORE', score)
    # Emit the Graph Structure
    yield key, ('GRAPH', adj_list)
    # Emit the new Mass/Page Rank
    for n in adj_list:
        if n != 'score':
            yield n, ('SCORE', score / l)
def reducer(self, key, values):
    i = int(get_jobconf_value('iteration'))
    teleportation = float(get_jobconf_value('teleportation'))
    nodes = int(get_jobconf_value('nodes'))
    adj_list = None
    total_score = 0
    for value_type, value in values:
        if value_type == 'GRAPH':
            adj_list = value
        else:
            assert value_type == 'SCORE'
            total_score += value
            #total_score = sum_log(total_score, value)
    # Special Key
    if key == 'DANGLING':
        # Write accumulated Dangling Score in a file
        with open('/Users/ssatpati/0-DATASCIENCE/DEV/github/ml/w261/wk9/dangling.txt', 'w') as f:
            f.write('DANGLING\t{0}\n'.format(total_score))
    else:
        #total_score = (teleportation / nodes) + ((1 - teleportation) * total_score)
        #total_score = sum_log((teleportation / nodes), ((1 - teleportation) * total_score))
        if adj_list:
            adj_list['score'] = total_score
        else:
            adj_list = {'score': total_score}
        #sys.stderr.write('[R2] {0} | {1} | {2}\n\n'.format(key, total_score, adj_list))
        yield key, adj_list
def mapper_CalcPij(self, key, value):
    """
    Bij = Aij - ( Ki * Kj ) / M

        = ( ( Aij * M ) - ( Ki * Kj ) )
          -----------------------------
                       M
    """
    #self.startDate = get_jobconf_value("maxNodeID")
    if key.isdigit():
        """
        Input:
        "1"    "2"
        "2"    "1"
        "3"    "4"
        "3"    "5"
        "4"    "3"
        "4"    "5"
        "5"    "3"
        "5"    "4"
        """
        row = key
        col = value
        Aij = 1.0
        yield row + "_" + col, ('A', Aij)
    elif 'k' == key:
        """
        "k"    ["7", 4]
        """
        row = value[0]
        minNodeID = int(get_jobconf_value("minNodeID"))
        maxNodeID = int(get_jobconf_value("maxNodeID"))
        k_row = float(get_jobconf_value("k" + row))
        M = float(get_jobconf_value("M"))
        for col in range(minNodeID, maxNodeID + 1):
            if 'local' == RUN_TYPE:
                #sys.stderr.write("col %d \t |" % col)
                pass
            k_col = float(get_jobconf_value("k" + str(col)))
            Pij = (k_row * k_col) / M
            yield row + "_" + str(col), ('P', Pij)
    elif 'm' == key:
        return
    elif 'max' == key:
        return
    elif 'min' == key:
        return
    elif 'x' == key:
        return
    else:
        assert False, "In genModularityMat_mapper(), wrong key type. key : %s" % key
def mapper(self, _, line):
    v = line.split(',')
    n = (len(v) - 2) / 2  # number of non-zero columns for this record
    i = int(get_jobconf_value("row.num.A"))  # we need to know how many rows of A
    j = int(get_jobconf_value("col.num.B"))  # we need to know how many columns of B
    if v[0] == '0':
        for p in range(n):
            for q in range(j):
                yield (int(v[1]), q), (int(v[p * 2 + 2]), float(v[p * 2 + 3]))
    elif v[0] == '1':
        for p in range(n):
            for q in range(i):
                yield (q, int(v[p * 2 + 2])), (int(v[1]), float(v[p * 2 + 3]))
def reducer1_final(self):
    num_nodes = len(self.nodes.keys())
    #conn = boto.connect_s3()
    aws_access_key_id = get_jobconf_value('aws_access_key_id')
    aws_secret_access_key = get_jobconf_value('aws_secret_access_key')
    conn = S3Connection(aws_access_key_id, aws_secret_access_key)
    mybucket = conn.get_bucket('ucb-mids-mls-juanjocarin')
    k = Key(mybucket)
    mykey = 'num_mass{}'.format(self.iteration)
    k.key = mykey
    k.set_contents_from_string(
        str(num_nodes) + '\t' + str(self.dangling_nodes_mass))
    for k, v in self.nodes.iteritems():
        yield k, [v[0], v[1]]
def mapper_init(self):
    self.frontier_node = get_jobconf_value('frontier_node')
    if not self.frontier_node:
        # Save a list of visited nodes
        self.visited = [s.strip() for s in open('visited.txt').readlines()]
        open('visited.txt', 'w').close()
def parsemat(self):
    """ Return 1 if this is the A matrix, otherwise return 2"""
    fn = get_jobconf_value('map.input.file')
    if self.options.Amatname in fn:
        return 1
    else:
        return 2
def mapper(self, _, line):
    # step 0: strip off unexpected characters
    line = line.split('\t')[1]
    # step 1: fetch the exodus file from Hadoop cluster
    file = os.path.basename(line)
    if os.path.isfile(os.path.join('./', file)):
        call(['rm', os.path.join('./', file)])
    check_call(['hadoop', 'fs', '-copyToLocal', line, os.path.join('./', file)])
    outdir = os.path.basename(line)
    ind = outdir.rfind('.')
    outdir = outdir[0:ind]
    if os.path.isdir(os.path.join('./', outdir)):
        call(['rm', '-r', os.path.join('./', outdir)])
    call(['mkdir', os.path.join('./', outdir)])
    # step 2: do our local processing
    result = convert(os.path.join('./', file), self.timesteps,
                     os.path.join('./', outdir), self.variables)
    # step 3: write back to Hadoop cluster
    user = get_jobconf_value('mapreduce.job.user.name')
    for fname in os.listdir(os.path.join('./', outdir)):
        if call(['hadoop', 'fs', '-test', '-e',
                 os.path.join(self.outdir, outdir, fname)]) == 0:
            call(['hadoop', 'fs', '-rm', os.path.join(self.outdir, outdir, fname)])
        call(['hadoop', 'fs', '-copyFromLocal',
              os.path.join('./', outdir, fname),
              os.path.join(self.outdir, outdir, fname)])
    call(['hadoop', 'fs', '-chown', '-R', user, os.path.join(self.outdir)])
    call(['rm', os.path.join('./', file)])
    call(['rm', '-r', os.path.join('./', outdir)])
    # step 4: yield output key/value
    if result == True:
        yield (line, 0)
    else:
        yield (line, 1)
def mapper_CalcPij(self, key, value):
    """
    Bij = Aij - ( Ki * Kj ) / M

        = ( ( Aij * M ) - ( Ki * Kj ) )
          -----------------------------
                       M
    """
    if key.isdigit():
        """
        Input:
        "1"    "2"
        "2"    "1"
        "3"    "4"
        "3"    "5"
        "4"    "3"
        "4"    "5"
        "5"    "3"
        "5"    "4"
        """
        row = key
        col = value
        Aij = 1.0
        yield row + "_" + col + "_" + "A", (Aij)
    elif 'k' == key:
        """
        "k"    ["7", 4]
        """
        row = value[0]
        k_row = float(get_jobconf_value("k" + row))
        for col in range(self.options.minNodeID, self.options.maxNodeID + 1):
            if 'local' == RUN_TYPE:
                #sys.stderr.write("col %d \t |" % col)
                pass
            k_col = float(get_jobconf_value("k" + str(col)))
            Pij = (k_row * k_col) / self.options.M
            yield row + "_" + str(col) + "_" + "P", (Pij)
    elif key in ['m', 'max', 'min', 'x']:
        return
    else:
        assert False, "In genModularityMat_mapper(), wrong key type. key : %s" % key
def mapper(self, _, line):
    # step 0: strip off unexpected characters
    line = line.split('\t')[1]
    # step 1: fetch the exodus file from Hadoop cluster
    file = os.path.basename(line)
    if os.path.isfile(os.path.join('./', file)):
        call(['rm', os.path.join('./', file)])
    check_call(['hadoop', 'fs', '-copyToLocal', line, os.path.join('./', file)])
    outdir = os.path.basename(line)
    ind = outdir.rfind('.')
    outdir = outdir[0:ind]
    if os.path.isdir(os.path.join('./', outdir)):
        call(['rm', '-r', os.path.join('./', outdir)])
    call(['mkdir', os.path.join('./', outdir)])
    # step 2: do our local processing
    if self.timestepfile is None:
        lines = None
    else:
        f = open(os.path.basename(self.timestepfile))
        lines = f.readlines()
        for i in xrange(0, len(lines)):
            lines[i] = float(lines[i].strip())
    result = convert(os.path.join('./', file), self.timesteps,
                     os.path.join('./', outdir), self.variables, lines)
    # step 3: write back to Hadoop cluster
    user = get_jobconf_value('mapreduce.job.user.name')
    for fname in os.listdir(os.path.join('./', outdir)):
        if call(['hadoop', 'fs', '-test', '-e',
                 os.path.join(self.outdir, outdir, fname)]) == 0:
            call(['hadoop', 'fs', '-rm', os.path.join(self.outdir, outdir, fname)])
        call(['hadoop', 'fs', '-copyFromLocal',
              os.path.join('./', outdir, fname),
              os.path.join(self.outdir, outdir, fname)])
    call(['hadoop', 'fs', '-chown', '-R', user, os.path.join(self.outdir)])
    call(['rm', os.path.join('./', file)])
    call(['rm', '-r', os.path.join('./', outdir)])
    # step 4: yield output key/value
    if result == True:
        yield (line, 0)
    else:
        yield (line, 1)
def mapper_dangling(self, key, value):
    #sys.stderr.write('[M_D] {0}, {1} \n'.format(key, value))
    i = int(get_jobconf_value('iteration'))
    key = key.replace("\"", "")
    key = key.replace("\\", "")
    adj_list = ast.literal_eval(str(value))
    if self.dangling_mass > 0:
        nodes = int(get_jobconf_value('nodes'))
        teleportation = float(get_jobconf_value('teleportation'))
        score = adj_list['score']
        modified_score = (teleportation / nodes) + (1 - teleportation) * ((self.dangling_mass / nodes) + score)
        #modified_score = sum_log((teleportation / nodes), (1 - teleportation)*(self.dangling_mass / nodes))
        #modified_score = sum_log(modified_score, (1 - teleportation)*score)
        adj_list['score'] = modified_score
    yield key, adj_list
def mapper(self, key, value):
    nodes = int(get_jobconf_value('nodes'))
    dangling_mass = float(get_jobconf_value('dangling_mass'))
    teleportation = float(get_jobconf_value('teleportation'))
    #sys.stderr.write('[M] {0}, {1} \n'.format(key, value))
    key = key.replace("\"", "")
    key = key.replace("\\", "")
    neighbors = ast.literal_eval(value)
    score = float(neighbors['score'])
    modified_score = teleportation / nodes + (1 - teleportation) * ((dangling_mass / nodes) + score)
    print '{0}, {1}, {2}'.format(score, modified_score, dangling_mass)
    neighbors['score'] = modified_score
    yield key, neighbors
def mapper(self, _, l):
    t = l.strip('\n').split('\t')
    text = t[1]
    i = int(t[0])
    n = int(get_jobconf_value("total"))
    for j in range(1, i):
        yield ("%d,%d" % (i, j)), text
    for j in range(i + 1, n + 1):
        yield ("%d,%d" % (j, i)), text
def mapper_dangling_init(self):
    i = int(get_jobconf_value('iteration'))
    aws_access_key_id = get_jobconf_value('aws_access_key_id')
    aws_secret_access_key = get_jobconf_value('aws_secret_access_key')
    self.dangling_mass = 0
    # Read Dangling Mass from S3 Bucket
    try:
        conn = boto.connect_s3()
        bucket = conn.get_bucket('ucb-mids-mls-juanjocarin')
        k = Key(bucket)
        k.key = 'hw93/dangling_mass/{0}'.format(i)  # Same as iteration
        self.dangling_mass = float(k.get_contents_as_string())
    except boto.exception.S3ResponseError as err:
        sys.stderr.write(str(err))
        sys.exit(1)
    sys.stderr.write('[{0}][M_D] DANGLING MASS: {1}\n'.format(i, self.dangling_mass))
def reducer(self, key, values):
    i = int(get_jobconf_value('iteration'))
    teleportation = float(get_jobconf_value('teleportation'))
    nodes = int(get_jobconf_value('nodes'))
    aws_access_key_id = get_jobconf_value('aws_access_key_id')
    aws_secret_access_key = get_jobconf_value('aws_secret_access_key')
    adj_list = None
    total_score = 0
    for value_type, value in values:
        if value_type == 'GRAPH':
            adj_list = value
        else:
            assert value_type == 'SCORE'
            total_score += value
            #total_score = sum_log(total_score, value)
    # Write Special Key to S3
    if key == 'DANGLING':
        # Write accumulated Dangling Score in a S3 Key
        try:
            conn = boto.connect_s3()
            bucket = conn.get_bucket('ucb-mids-mls-juanjocarin')
            k = Key(bucket)
            k.key = 'hw93/dangling_mass/{0}'.format(i)  # Same as iteration
            k.set_contents_from_string(str(total_score))
        except boto.exception.S3ResponseError as err:
            sys.stderr.write(str(err))
            sys.exit(1)
    else:
        #total_score = (teleportation / nodes) + ((1 - teleportation) * total_score)
        #total_score = sum_log((teleportation / nodes), ((1 - teleportation) * total_score))
        if adj_list:
            adj_list['score'] = total_score
        else:
            adj_list = {'score': total_score}
        #sys.stderr.write('[R2] {0} | {1} | {2}\n\n'.format(key, total_score, adj_list))
        yield key, adj_list
def mapper_dangling_init(self):
    i = int(get_jobconf_value('iteration'))
    self.dangling_mass = 0
    f_dangling = '/Users/ssatpati/0-DATASCIENCE/DEV/github/ml/w261/wk9/dangling.txt'
    try:
        with open(f_dangling, 'r') as f:
            l = f.readlines()
            if l:
                self.dangling_mass = float(l[0].split('\t')[1])
        open(f_dangling, 'w').close()
    except Exception as e:
        pass
    sys.stderr.write('[{0}][M_D] DANGLING MASS: {1}\n'.format(i, self.dangling_mass))
def reducer(self, idx, inputdata):
    centroids = []
    k = int(get_jobconf_value('k'))
    num = [0] * k
    for i in range(k):
        centroids.append([0 for i in xrange(1000)])
    for d, n in inputdata:
        num[idx] = num[idx] + n
        for i in xrange(1000):
            centroids[idx][i] = centroids[idx][i] + d[i]
    for i in xrange(1000):
        centroids[idx][i] = centroids[idx][i] / num[idx]
    with open('Centroids.txt', 'a') as f:
        f.writelines(",".join(str(i) for i in centroids[idx]) + '\n')
    yield idx, (centroids[idx], num)
def mapper(self, _, line):
    dist_type = get_jobconf_value('dist_type')
    tokens = line.strip().split('\t')
    key = tokens[0].replace("\"", "")
    dict_pairs = ast.literal_eval(tokens[1])
    for n_key, n_dict_pairs in self.stripes.iteritems():
        # TODO distance calc for only (a,b) but not (b,a) --> Redundant
        if key > n_key:
            continue
        self.counter += 1
        if self.counter % 1000 == 0:
            self.set_status('# of Distances Calculated: {0}'.format(self.counter))
        distance = None
        if dist_type == 'euclid':
            # Calculate Euclidean Distance
            squared_distance = 0
            for k in n_dict_pairs.keys():
                squared_distance += (dict_pairs.get(k, 0) - n_dict_pairs.get(k, 0)) ** 2
            distance = math.sqrt(squared_distance)
        if dist_type == 'cosine':
            # Calculate Cosine Distance
            # Get the intersection of keys from both stripes
            norm_x = 0
            norm_y = 0
            dot_x_y = 0
            for k in self.stripes.keys():  # Iterate through entire key range once
                norm_x += dict_pairs.get(k, 0) * dict_pairs.get(k, 0)
                norm_y += n_dict_pairs.get(k, 0) * n_dict_pairs.get(k, 0)
                dot_x_y += dict_pairs.get(k, 0) * n_dict_pairs.get(k, 0)
            distance = float(dot_x_y) / (math.sqrt(norm_x) * math.sqrt(norm_y))
        self.increment_counter('distance', 'num_{0}_distances'.format(dist_type), amount=1)
        yield (distance), (key, n_key)
def mapper_dangling_init(self):
    i = int(get_jobconf_value('iteration'))
    # Page/Topic Mapping & Topic Counts for each Topic.
    self.topics = {}
    self.topicCounts = {}
    with open('randNet_topics.txt') as f:
        for l in f:
            t = l.split('\t')
            self.topics[t[0].strip()] = t[1].strip()
    for k, v in self.topics.iteritems():
        self.topicCounts[v] = self.topicCounts.get(v, 0) + 1
    self.dangling_mass = 0
    f_dangling = '/Users/ssatpati/0-DATASCIENCE/DEV/github/ml/w261/wk9/dangling.txt'
    try:
        with open(f_dangling, 'r') as f:
            l = f.readlines()
            if l:
                self.dangling_mass = float(l[0].split('\t')[1])
        open(f_dangling, 'w').close()
    except Exception as e:
        pass
class PageRank_AWS(MRJob):

    iteration = get_jobconf_value('iteration')
    #OUTPUT_PROTOCOL = RawValueProtocol
    alpha = 0.85

    def steps(self):
        return [
            MRStep(mapper_init=self.mapper1_init,
                   mapper=self.mapper1,
                   mapper_final=self.mapper1_final,
                   reducer_init=self.reducer1_init,
                   reducer=self.reducer1,
                   reducer_final=self.reducer1_final),
            MRStep(mapper=self.mapper2,
                   reducer=self.reducer2)
        ]

    #############
    ## 1st JOB ##
    #############

    def mapper1_init(self):
        ## Associative array for in-mapper combining
        self.nodes = {}
        self.nodes_prev = {}
        ## Keep track of nodes in the graph
        self.list_nodes = []
        self.list_sources = []

    def mapper1(self, _, line):
        ## Format of each line:
        ## n1 TAB {n2: 1, n3: 1, ...} TAB PR(n1)
        ## The 3rd field is not present in the 1st iteration,
        ## which only contains each node and its adjacency list
        #line = line.strip().split('\t')
        ## 1st field is a node that acts as source
        ## (Though from 2nd iteration on all nodes will be present,
        ## some with an empty list of outgoing links)
        #source = line[0]
        ## 2nd field is a dictionary of links with their weights
        ## (Set to 1 from 2nd iteration on; they're not relevant anyway)
        #sink = ast.literal_eval(line[1])
        ## Keep only the sinks, not the weights
        #sinks = sink.keys()
        ## Include those sinks in a list of nodes
        line = line.strip().split('\t')
        source = line[0].strip('"')
        if isinstance(ast.literal_eval(line[1]), dict):
            sinks = ast.literal_eval(line[1]).keys()
            if len(line) < 3:
                PR = 1e-3
                for sink in sinks:
                    self.nodes_prev[sink] = PR
            else:
                PR = float(line[2])
                self.nodes_prev[source] = PR
        else:
            sinks = ast.literal_eval(line[1])[0].keys()
            PR = float(ast.literal_eval(line[1])[1])
            self.nodes_prev[source] = PR
        for sink in sinks:
            if sink not in self.list_nodes:
                self.list_nodes.append(sink)
        ## Pass the graph structure (the adjacency list)
        yield source, [sinks]
        ## Include the source in the list of nodes, too
        if source not in self.list_nodes:
            self.list_nodes.append(source)
        # And also in a list of sources... if it really has outgoing links!
        if source not in self.list_sources and len(sinks) != 0:
            self.list_sources.append(source)
        ## If PR of the source is not present (1st iteration)
        #if len(line) < 3:
            ## All nodes have an initial PR of 1e-3
            ## The value can be any (not necessarily 1/|G|)
            ## (We don't know the value of |G| yet!!!)
            ## Just takes more or less to converge
            ## and the sum of PRs in the first iterations
            ## will be less than 100%
            # PR = 1e-3
            ## Keep track of the previous PR to distribute the PR mass
            ## of the dangling nodes
            ## (They are never sources, but have to be sinks)
            # for sink in sinks:
            #     self.nodes_prev[sink] = PR
        #else:
            ## From 2nd iteration on, we already know the PR
            # PR = float(line[2])
            # self.nodes_prev[source] = PR
        ## Distribute the mass of the source along its sinks
        ## We put the value in the associative array
        ## and emit it in the in-mapper combiner
        for node in sinks:
            self.nodes.setdefault(node, 0.)
            self.nodes[node] = self.nodes[node] + PR / len(sinks)

    def mapper1_final(self):
        ## For all nodes detected
        for node in self.list_nodes:
            ## If they have ingoing links, emit their PR
            ## as well as the total number of nodes (|G|)
            if node in self.nodes.keys():
                yield node, [self.nodes[node], len(self.list_nodes), 0.]
                ## If not a source (i.e., a DANGLING NODE) emit its previous PR
                ## to be distributed; otherwise, 0
                if node in self.nodes_prev.keys():
                    yield node, [0., len(self.list_nodes), self.nodes_prev[node]]
            ## If not (they are sources but not sinks), their PR will be 0
            ## (before considering dangling nodes & teleportation)
            else:
                yield node, [0., len(self.list_nodes), 0.]

    ## 1st mapper emits each node as key, and 3 values per node
    ## Current PR (w/o considering dangling nodes and teleportation yet)
    ## |G|: number of nodes in the graph
    ## Previous PR if node is a dangling one, 0 otherwise
    ## Also (part of) the structure of the graph (adjacency lists)

    def reducer1_init(self):
        ## Keep track of nodes in the graph
        self.nodes = {}
        self.dangling_nodes_mass = 0

    def reducer1(self, key, value):
        ## Variables to keep track / aggregate PRs, number of nodes, sinks, etc.
        PR = 0.
        num_nodes = 0
        sinks = {}
        outlinks = []
        if key not in self.nodes.keys():
            self.nodes.setdefault(key, [])
        prev_mass = 0.
        for v in value:
            node_type = 'sink'
            ## When the value is the graph structure (outlinks of a node)
            if isinstance(v[0], list):
                outlinks = v[0]
                if len(outlinks) != 0:
                    node_type = 'source'
            ## When the value is the mass passed by a neighbor linking to the
            ## node (as well as the number of nodes and previous PR in
            ## case of a dangling node)
            else:
                PR = PR + v[0]
                # num_nodes = v[1]
                prev_mass = prev_mass + v[2]
        if node_type == 'sink':
            self.dangling_nodes_mass = self.dangling_nodes_mass + prev_mass
        ## Add weights to the adjacency list to be consistent with the original
        ## file structure
        for node in outlinks:
            sinks[node] = 1
        ## The 1st job emits each node as key, and 4 values
        ## its adjacency list
        ## its current PageRank
        ## the total number of nodes found in the graph
        ## the previous PR in case of a dangling node
        self.nodes[key] = [sinks, PR, num_nodes]

    def reducer1_final(self):
        num_nodes = len(self.nodes.keys())
        #conn = boto.connect_s3()
        aws_access_key_id = get_jobconf_value('aws_access_key_id')
        aws_secret_access_key = get_jobconf_value('aws_secret_access_key')
        conn = S3Connection(aws_access_key_id, aws_secret_access_key)
        mybucket = conn.get_bucket('ucb-mids-mls-juanjocarin')
        k = Key(mybucket)
        mykey = 'num_mass{}'.format(self.iteration)
        k.key = mykey
        k.set_contents_from_string(
            str(num_nodes) + '\t' + str(self.dangling_nodes_mass))
        for k, v in self.nodes.iteritems():
            yield k, [v[0], v[1]]

    #############
    ## 2nd JOB ##
    #############

    def mapper2_init(self):
        ## Associative array for in-mapper combining
        self.nodes = {}
        ## Variable to keep track of the "lost" mass
        #self.dangling_nodes_mass = 0

    def mapper2(self, key, value):
        ## Aggregate (previous!) PR mass of the dangling nodes
        #self.dangling_nodes_mass = sum_log(self.dangling_nodes_mass, value[3])
        ## Don't need the 4th value anymore
        #self.nodes[key] = value[:3]
        yield key, value

    def mapper2_final(self):
        ## Emit the associative array
        for node in self.nodes.keys():
            value = self.nodes[node]
            ## Include the PR mass of dangling nodes again
            ## But now it's the total mass, and included in every node
            #value.append(self.dangling_nodes_mass)
            #yield node, value

    def reducer2(self, key, value):
        mass = 0.
        num_nodes = 0
        #conn = boto.connect_s3()
        aws_access_key_id = get_jobconf_value('aws_access_key_id')
        aws_secret_access_key = get_jobconf_value('aws_secret_access_key')
        conn = S3Connection(aws_access_key_id, aws_secret_access_key)
        mybucket = conn.get_bucket('ucb-mids-mls-juanjocarin')
        k = Key(mybucket)
        mykey = 'num_mass{}'.format(self.iteration)
        k.key = mykey
        data = k.get_contents_as_string()
        data = data.strip().split('\t')
        num_nodes = num_nodes + int(data[0])
        mass = mass + float(data[1])
        for v in value:
            ## 1) Its adjacency list
            sinks = v[0]
            ## 2) The corrected PR
            ## PR = alpha * PR + alpha * m / |G| + (1-alpha) * (1/|G|)
            PR = self.alpha * (v[1] + mass / num_nodes)
            PR = PR + (1 - self.alpha) / num_nodes
            yield key, [sinks, PR]
def mapper_init(self):
    self.cui_idx = int(get_jobconf_value("cui_idx"))
def mapper(self, _, line):
    for word in line.split():
        yield word, get_jobconf_value('map.input.file')
def mapper_init(self):
    self.start_node = get_jobconf_value('start_node')
    self.stop_node = get_jobconf_value('stop_node')
    sys.stderr.write('### Start/Frontier Node: {0}\n'.format(self.start_node))
    sys.stderr.write('### Stop: {0}\n'.format(self.stop_node))
def test_get_jobconf_value_2(self):
    os.environ['mapreduce_job_user_name'] = 'Edsger W. Dijkstra'
    self.assertEqual(get_jobconf_value('user.name'), 'Edsger W. Dijkstra')
    self.assertEqual(get_jobconf_value('mapreduce.job.user.name'), 'Edsger W. Dijkstra')
def test_get_jobconf_value_1(self):
    os.environ['user_name'] = 'Edsger W. Dijkstra'
    assert_equal(get_jobconf_value('user.name'), 'Edsger W. Dijkstra')
    assert_equal(get_jobconf_value('mapreduce.job.user.name'), 'Edsger W. Dijkstra')
def reducer(self, key, values):
    """
    input: -1, (index, valarray)
    output: global variance exodus file
    """
    val_order = {}
    for i, value in enumerate(values):
        val_order[value[0]] = value[1]
    val = []
    for k, value in sorted(val_order.iteritems()):
        val.extend(value)
    val2 = np.array(val)
    # grab template exodus file from HDFS
    tmpstr = self.indir[7:]
    index = tmpstr.find('/')
    prefix = 'hdfs://' + tmpstr[0:index]
    cmd = 'hadoop fs -ls ' + self.indir
    p = Popen(cmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True)
    content = p.stdout.read()
    files = content.split('\n')
    flag = True
    for file in files:
        file = file.split(' ')
        fname = file[len(file) - 1]
        if fname.endswith('.e'):
            fname = prefix + fname
            if flag:
                check_call(['hadoop', 'fs', '-copyToLocal', fname, 'template.e'])
                flag = False
            break
    template = 'template.e'
    # create new interpolation exodus file
    if call(['test', '-e', template]) != 0:
        print >>sys.stderr, "The template file does not exist!"
        yield key, 1
    else:
        print >>sys.stderr, "Reading template file %s" % (template)
        templatefile = ep.ExoFile(template, 'r')
        outfile = self.outputname + '.e'
        print >>sys.stderr, "Writing output file %s" % (os.path.join(outfile))
        newfile = ep.ExoFile(os.path.join(outfile), 'w')
        time_steps = np.array([0.0])
        templatefile.change_nodal_vars2(newfile, time_steps, [self.variable], [val2], ['d'])
        newfile.src.sync()
        newfile.close()
        print >>sys.stderr, "Finished writing data, copying to Hadoop"
        user = get_jobconf_value('mapreduce.job.user.name')
        call(['hadoop', 'fs', '-copyFromLocal', outfile, os.path.join(self.outdir, outfile)])
        call(['hadoop', 'fs', '-chown', '-R', user, os.path.join(self.outdir)])
        print >>sys.stderr, "Copied to Hadoop, removing ..."
        call(['rm', template])
        call(['rm', outfile])
        yield key, 0
        print >>sys.stderr, "Done"
def mapper(self, _, line):
    for jobconf in JOBCONF_LIST:
        yield (jobconf, get_jobconf_value(jobconf))
def mapper(self, _, line):
    for word in WORD_RE.findall(line):
        yield (get_jobconf_value("mapreduce.map.input.file"), 1)