def main():
    args = parse_args()
    combined_aps = parse_grouped_aps(args.g)
    with open(args.a, 'r') as infile:
        aps = [line.rstrip() for line in infile]
    data = parse_data_files(aps, args.d)
    print "Num records: ", len(data)
    sc = mapreduce()
    # sc = SparkContext(appName="NetflixProblemApp")
    result = (sc.parallelize(data, 128)
              .map(mapper0).reduceByKey(reducer)
              .flatMap(mapper1).reduceByKey(reducer)
              .filter(lambda x: x[0] not in combined_aps)
              .flatMap(mapper3).reduceByKey(reducer)
              .sortBy(lambda x: len(x[1]))
              .collect())
    sc.stop()
    for record in result:
        date_elem, ap_conns = record
        output = filter(lambda x: len(x[1]) > NUM_JUMPS_THRESHOLD, ap_conns)
        day = str(date_elem.day)
        month = str(date_elem.month)
        year = str(date_elem.year)
        with open(args.o + month + '-' + day + '-' + year[2:], 'w') as outfile:
            json.dump(output, outfile, indent=4, encoding='latin1')

def main():
    args = parse_args()
    raw_zones, aps = parse_grouped_aps(args.g)
    zones = {}
    for zone in raw_zones:
        for ap in zone['aps']:
            zones[ap] = str(zone['zone'])
    data = parse_data_files(aps, args.d, zones)
    print "Num records: ", len(data)
    sc = mapreduce()
    # sc = SparkContext(appName="NetflixProblemApp")
    result = (sc.parallelize(data, 128)
              .map(mapper0).reduceByKey(reducer)
              .flatMap(mapper1).reduceByKey(reducer)
              .flatMap(mapper3).reduceByKey(reducer)
              .sortBy(lambda x: len(x[1]))
              .collect())
    sc.stop()
    for record in result:
        date_elem, ap_conns = record
        output = filter(lambda x: len(x[1]) > NUM_JUMPS_THRESHOLD, ap_conns)
        day = str(date_elem.day)
        month = str(date_elem.month)
        year = str(date_elem.year)
        with open(args.o + month + '-' + day + '-' + year[2:] + '.json', 'w') as outfile:
            json.dump(output, outfile, indent=4, encoding='latin1')

def main():
    ratings = read_netflix_ratings()
    map_reducer = mapreduce()
    pipeline = map_reducer.parallelize(ratings, 128)
    similar_table = pipeline.map(mapper1) \
        .reduceByKey(reducer) \
        .flatMap(mapper2) \
        .reduceByKey(reducer) \
        .flatMap(mapper3) \
        .reduceByKey(reducer) \
        .flatMap(mapper4) \
        .reduceByKey(reducer) \
        .flatMap(mapper5)
    recommend_result = []
    print('******************************* Recommendation results ***********************************')
    for item in similar_table.collect():
        recommend_result.append(item)
        print(item)
    print('*********************************** Task 6 *****************************************')
    df = task_6(recommend_result)
    return df

def main():
    # Get the ratings from ratings.csv
    ratings = read_netflix_ratings()
    # Initialize MapReduce
    map_reducer = mapreduce()
    pipeline = map_reducer.parallelize(ratings, 128)
    # DO NOT MODIFY THIS!
    similar_table = pipeline.map(mapper1) \
        .reduceByKey(reducer) \
        .flatMap(mapper2) \
        .reduceByKey(reducer) \
        .flatMap(mapper3) \
        .reduceByKey(reducer) \
        .flatMap(mapper4) \
        .reduceByKey(reducer) \
        .flatMap(mapper5)
    recommend_result = []
    print('******************************* Recommendation results ***********************************')
    for item in similar_table.collect():
        recommend_result.append(item)
        print(item)
    print('*********************************** Task 6 *****************************************')
    df = task_6(recommend_result)
    df2 = task_6_2(recommend_result)
    display(df2)
    return df

def get(self):
    import mapreduce
    il_ve_sayi = mapreduce.mapreduce()
    cc = self.db.kullanici.find()
    if self.current_user:
        self.render("index.html", kullanici_adi=self.get_user_name(),
                    kullanicilar=cc, ilsayi=il_ve_sayi)
    else:
        self.render("index.html", kullanici_adi="",
                    kullanicilar=cc, ilsayi=il_ve_sayi)

def main():
    with open('condensedStats.csv', 'rb') as f:
        data = [line.split(',') for line in f]
    sc = mapreduce()
    result = sc.parallelize(data[1:], 128) \
        .map(mapper1) \
        .reduceByKey(reducer) \
        .sortByKey(True) \
        .collect()
    sc.stop()
    topVids = result[len(result) - 51:]
    l = []
    for vid in topVids:
        l.extend(vid[1][3].lower().split(';'))
    counter = collections.Counter(l)
    with open('mostCommonTags.csv', 'wb') as c:
        writer = csv.writer(c)
        writer.writerow(['Tag', 'Count'])
        for key, count in counter.most_common():
            writer.writerow([key, count])
    with open('commentsFile.csv', 'wb') as c:
        writer = csv.writer(c)
        writer.writerow(['Id', 'Title', 'Description', 'Comments (; delimited list)'])
        regex = re.compile('[%s]' % re.escape(string.punctuation))
        for vid in topVids:
            try:
                comments = json.load(urllib2.urlopen(url + vid[1][0] + '&key=' + api_key))
            except Exception as e:
                print(e)
                print(vid[1][0])
                continue
            commentList = ''
            if comments['items']:
                thread = []
                for item in comments['items']:
                    if 'textDisplay' in item['snippet'].get('topLevelComment', {}).get('snippet', {}):
                        comm = re.sub(r'http\S+|www.\S+|href\S+', '',
                                      item['snippet']['topLevelComment']['snippet']['textDisplay'])
                        date = item['snippet']['topLevelComment']['snippet']['publishedAt']
                        # comm = ' '.join(w for w in nltk.wordpunct_tokenize(comm) if w.lower() in engWords or not w.isalpha())
                        thread.append(regex.sub('', comm) + '|' + date)
                commentList = ';'.join(thread)
            writer.writerow([vid[1][0], vid[1][1], vid[1][2],
                             commentList.encode('utf8').decode('unicode_escape').encode('ascii', 'ignore')])

def word_count(title_text_pairs, verbose=False):
    def map_f(_title, text):
        for word in text.split(' '):
            yield (word, 1)

    def reduce_f(word, counts):
        yield (word, sum(counts))

    return mapreduce(map_f=map_f, combine_f=reduce_f, reduce_f=reduce_f,
                     verbose=verbose)(title_text_pairs)

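# A minimal usage sketch for word_count above. The sample (title, text) pairs are
# illustrative assumptions, and the exact shape of the returned collection depends
# on the mapreduce() implementation, which is not shown in this source.
pairs = [("doc1", "the quick brown fox"), ("doc2", "the quick lazy dog")]
print(word_count(pairs, verbose=True))  # expected word counts, e.g. ('the', 2), ('quick', 2), ...
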
def SpMultiply(A, B):
    # assert A.n == B.n
    sijv = [('a', ijv[0][0], ijv[0][1], ijv[1]) for ijv in A.ijv]
    sijv += [('b', ijv[0][0], ijv[0][1], ijv[1]) for ijv in B.ijv]
    map_reducer = mapreduce()
    print('MapReduce input:')
    pprint(sijv)
    matrix_multi = map_reducer.parallelize(sijv, 128) \
        .flatMap(mapper1) \
        .reduceByKey(reducer) \
        .flatMap(mapper2) \
        .reduceByKey(reducer)
    print('MapReduce Output:')
    final_matrix = []
    for item in matrix_multi.collect():
        print(item)
        final_matrix.append(item)
    return SpMatrix(A.m, B.n, final_matrix)

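# SpMultiply above relies on mapper1, mapper2, and reducer that this source does not
# show. The helpers below are a hedged sketch of one standard two-stage MapReduce
# sparse-matrix multiply that fits the ('a'/'b', i, j, v) records built in SpMultiply;
# they are assumptions, not the original implementation.

def reducer(x, y):
    # '+' concatenates the per-key entry lists in stage 1 and sums products in stage 2
    return x + y

def mapper1(record):
    # Join A on its column index and B on its row index (the shared inner dimension)
    tag, i, j, v = record
    if tag == 'a':
        return [(j, [('a', i, v)])]
    return [(i, [('b', j, v)])]

def mapper2(pair):
    # For each shared index, emit one partial product per (row of A, column of B)
    _, entries = pair
    a_entries = [(i, v) for tag, i, v in entries if tag == 'a']
    b_entries = [(j, v) for tag, j, v in entries if tag == 'b']
    return [((i, j), va * vb) for i, va in a_entries for j, vb in b_entries]
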
def main():
    args = parse_args()
    raw_zones, aps = parse_grouped_aps(args.g)
    zones = {}
    for zone in raw_zones:
        for ap in zone['aps']:
            zones[ap] = str(zone['zone'])
    data = parse_data_files(aps, args.d, zones)
    print "Num Records: ", len(data)
    sc = mapreduce()
    # sc = SparkContext(appName="CountsApp")
    result = (sc.parallelize(data, 128)
              .map(mapper0).reduceByKey(reducer)
              .flatMap(mapper1).reduceByKey(reducer)
              .map(mapper2).reduceByKey(reducer)
              .collect())
    zone_counts = {}
    for zone, counts in result:
        zone_counts[zone] = counts
    for i in range(NUM_DAYS):
        day_start = START_TIME + DAY * i
        date_elem = date.fromtimestamp(day_start)
        subset = defaultdict(dict)
        subset["interval"] = COUNT_INTERVAL
        subset["start_time"] = day_start
        for zone in zone_counts:
            counts = zone_counts[zone]
            for j in range(int(DAY / COUNT_INTERVAL)):
                interval = day_start + j * COUNT_INTERVAL
                if interval in counts:
                    subset[zone][interval] = counts[interval]
        day = str(date_elem.day)
        month = str(date_elem.month)
        year = str(date_elem.year)
        with open(args.o + month + '-' + day + '-' + year[2:] + '.json', 'w') as outfile:
            json.dump(subset, outfile, indent=4, encoding='latin1')

def run_mapreduce(self):
    '''
    Parameters:
        Input: None
        Output: Clusters
    Purpose: Fetches all of the song data from the database, formats it so it
        can be fed to MapReduce, runs the mappers and reducers for the
        configured number of iterations, and returns the resulting clusters.
    '''
    conn = sqlite3.connect('data/playlist_data.db')
    conn.text_factory = lambda x: str(x, 'latin1')
    c = conn.cursor()
    c.execute("""select * from songs;""")
    d = c.fetchall()
    data = []
    for i in range(len(d)):
        key = [d[i][0], d[i][1], d[i][2], d[i][3],
               d[i][4], d[i][5], d[i][6], d[i][7]]
        values = []
        for y in range(8, len(d[i])):
            values.append(d[i][y])
        data.append([key, values])
    for i in range(len(data)):
        data[i] = [(0), data[i]]
    data2 = copy.deepcopy(data)
    for i in range(self._numClusters):
        self._centroidRandomNodes.append([(i), data2[i][1][1]])
    sc = mapreduce()
    output = []
    new_data = data
    result = []
    for i in range(self._numIterations):
        result = (sc.parallelize(data, 128)
                  .map(self.mapper1)
                  .reduceByKey(self.reducer)
                  .map(self.mapper2)
                  .collect())
    sc.stop()
    return result

def main():
    # Get the ratings from ratings.csv
    ratings = read_netflix_ratings()
    # Initialize MapReduce
    map_reducer = mapreduce()
    pipeline = map_reducer.parallelize(ratings, 128)
    # DO NOT MODIFY THIS!
    similar_table = pipeline.map(mapper1) \
        .reduceByKey(reducer) \
        .flatMap(mapper2) \
        .reduceByKey(reducer) \
        .flatMap(mapper3) \
        .reduceByKey(reducer) \
        .flatMap(mapper4) \
        .reduceByKey(reducer) \
        .flatMap(mapper5)
    for item in similar_table.collect():
        print(item)

def main():
    args = parse_args()
    with open(args.a, 'r') as infile:
        aps = [line.rstrip() for line in infile]
    data = parse_data_files(aps, args.d)
    print "Num records: ", len(data)
    sc = mapreduce()
    # sc = SparkContext(appName="NetflixProblemApp")
    similarities_result = (sc.parallelize(data, 128)
                           .map(mapper0).reduceByKey(reducer)
                           .flatMap(mapper1).reduceByKey(reducer)
                           .map(mapper2)
                           # .sortBy(lambda x: (x[0][0], x[0][1])).collect())
                           .sortBy(lambda x: (x[1])).collect())
    sc.stop()
    with open(args.o or dirname(realpath(__file__)) + '/connections.json', 'w') as outfile:
        json.dump(similarities_result, outfile, indent=4, encoding='latin1')

def main():
    archivos = ['archivo1.txt', 'archivo2.txt', 'archivo3.txt']
    res = mapreduce.mapreduce(archivos, f_map, f_reduce)
    mapreduce.printb(res)

def configure(self, env):
    import params
    env.set_params(params)
    mapreduce(name="historyserver")

def configure(self, env):
    import params
    env.set_params(params)
    mapreduce(name="historyserver")

def mapper_word_count(input, doc_id):
    import string
    input = input.translate(str.maketrans('', '', string.punctuation))
    input = input.split(" ")
    word_op = []
    for word in input:
        if word != "":
            word_op.append((word, 1))
    return word_op

def reducer_word_count(input):
    return sum(input)

def mapper_inverted_index(input, doc_id):
    import string
    input = input.translate(str.maketrans('', '', string.punctuation))
    input = input.split(" ")
    word_op = []
    for word in input:
        if word != "":
            word_op.append((word + "_" + doc_id, 1))
    return word_op

def reducer_inverted_index(input):
    return sum(input)

from mapreduce import mapreduce

input_location = r"./Input"
run_object = mapreduce(num_mappers=5, num_reducers=5)
run_object.run(input_location, mapper_word_count, reducer_word_count)

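# The snippet above defines an inverted-index mapper/reducer pair but only runs the
# word-count job. A minimal sketch of kicking off the second job follows, assuming
# run() accepts any (mapper, reducer) pair over the same input directory; the runner
# variable name here is purely illustrative.
inverted_index_runner = mapreduce(num_mappers=5, num_reducers=5)
inverted_index_runner.run(input_location, mapper_inverted_index, reducer_inverted_index)
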
def configure(self, env):
    import params
    env.set_params(params)
    mapreduce()

def configure(self, env):
    import params
    env.set_params(params)
    mapreduce()

from mapreduce.config import MapReduce, Mapper, Reducer
from mapreduce import mapreduce
import sys


class WC_Mapper(Mapper):
    def map(self, key, value):
        res = []
        for word in value.split(' '):
            res.append((word, 1))
        return res


class WC_Reducer(Reducer):
    def reduce(self, key, value):
        res = 0
        for elem in value:
            res += int(elem)
        return res


if __name__ == "__main__":
    out = sys.argv[1]
    wc_m = WC_Mapper()
    wc_r = WC_Reducer()
    config = MapReduce('./input', wc_m, wc_r, out)
    mapreduce(config)

def configure(self, env):
    import params
    env.set_params(params)
    mapreduce(name="jobtracker")