def reducer(klv, params):
    """Reduce step that tallies per-document counts for each known token.

    Only the counts for every (token, tweet id) pair are produced here; the
    Counts matrix itself is assembled from these results afterwards.

    Args:
        klv: The key and list of values given as input to the reducer; each
            value is unpacked as a ``(doc_id, count)`` pair.
        params: The parameters supplied while initializing the MapReduce.
            ``params.trained_vectorizer.vocabulary_`` maps tokens to indices.

    Yields:
        ``(index, {doc_id: total_count})`` for every token found in the
        vocabulary. Tokens missing from the vocabulary are skipped.
    """
    from disco.util import kvgroup

    vectorizer = params.trained_vectorizer
    for raw_token, pairs in kvgroup(sorted(klv)):
        # Whitespace is removed everywhere in the token before the lookup.
        column = vectorizer.vocabulary_.get(''.join(raw_token.strip().split()))
        if column is None:
            continue
        totals = {}
        for doc_id, count in pairs:
            if doc_id in totals:
                totals[doc_id] += count
            else:
                totals[doc_id] = count
        yield column, totals
def mean_vs(iter, *args):
    """Yield ``(key, mean)`` for every key in a stream of key/value pairs.

    Args:
        iter: Iterable of ``(key, value)`` pairs; values must be convertible
            with ``float()``.
        *args: Ignored; accepted so the function fits reducer call signatures.

    Yields:
        ``(key, arithmetic mean of the key's values)`` in sorted key order.
    """
    from disco.util import kvgroup

    for k, vs in kvgroup(sorted(iter)):
        total = 0.
        count = 0
        # Count from 1 so ``count`` ends as the number of values. The
        # zero-based index used before divided by ``len - 1``, which gave a
        # wrong mean and a ZeroDivisionError for single-value keys.
        for count, v in enumerate(vs, 1):
            total += float(v)
        yield k, total / count
def reduce(row_iter, params):
    """Join every "Edge" record with the "User" record sharing its key.

    Args:
        row_iter: Iterable of ``(key, record)`` pairs where each record is a
            dict with a ``"type"`` entry of ``"User"`` or ``"Edge"``.
        params: Unused.

    Yields:
        ``(row, "")`` where ``row`` is the tuple ``(source age, source gender,
        source income, target age, target gender, target income, location,
        duration, day, exp_id)``. Source attributes come from the edge's
        ``"source_attr"`` dict, target attributes from the user record.
        Keys without a user record produce nothing.
    """
    from disco.util import kvgroup

    # Sort on the key only: sorting the raw pairs would compare the record
    # dicts whenever two keys tie, which raises TypeError on Python 3.
    for key, vals in kvgroup(sorted(row_iter, key=lambda kv: kv[0])):
        vals = list(vals)
        user = None
        for record in vals:
            if record["type"] == "User":
                user = record  # no break: the last user record wins
        if not user:
            continue
        for record in vals:
            if record["type"] != "Edge":
                continue
            source = record["source_attr"]
            yield (source["age"], source["gender"], source["income"],
                   user["age"], user["gender"], user["income"],
                   record["location"], record["duration"], record["day"],
                   record["exp_id"]), ""
def reduce(row_iter, params):
    """Join every "Edge" record with the first "User" record of its key.

    Args:
        row_iter: Iterable of ``(key, record)`` pairs where each record is a
            dict with a ``"type"`` entry of ``"User"`` or ``"Edge"``.
        params: Unused.

    Yields:
        ``(row, "")`` where ``row`` holds the edge's source age, gender and
        income, the user's age, gender and income, then the edge's location,
        duration, day and exp_id. Keys without a user record yield nothing.
    """
    from disco.util import kvgroup

    # Sort on the key only: comparing whole pairs would fall back to
    # comparing the record dicts on key ties (TypeError on Python 3).
    for key, vals in kvgroup(sorted(row_iter, key=lambda kv: kv[0])):
        vals = list(vals)
        user = None
        for candidate in vals:
            if candidate["type"] == "User":
                user = candidate
                break
        if not user:
            continue
        for edge in vals:
            if edge["type"] != "Edge":
                continue
            source = edge["source_attr"]
            yield (
                source["age"],
                source["gender"],
                source["income"],
                user["age"],
                user["gender"],
                user["income"],
                edge["location"],
                edge["duration"],
                edge["day"],
                edge["exp_id"],
            ), ""
def reduce(self, rows_iter, out, params):
    """Merge the descriptor lists of consecutive rows sharing a URL key.

    The input is grouped without sorting, so only adjacent rows with equal
    keys are merged.

    Args:
        rows_iter: Iterable of ``(url_key, descriptors)`` pairs, where
            ``descriptors`` is itself iterable.
        out: Output sink; ``out.add(url_key, merged)`` is called for every
            group whose merged list holds more than one descriptor.
        params: Unused.
    """
    from disco.util import kvgroup

    for url_key, descriptor_lists in kvgroup(rows_iter):
        merged = [item for descriptors in descriptor_lists for item in descriptors]
        if len(merged) <= 1:
            continue
        out.add(url_key, merged)
def reduce(iter, params):
    """Yield, per year, the string form of the list of its values.

    Args:
        iter: Iterable of ``(year, value)`` pairs.
        params: Unused.

    Yields:
        ``(year, str(list_of_values))`` in sorted order.
    """
    from disco.util import kvgroup

    ordered = sorted(iter)
    for year, dance_values in kvgroup(ordered):
        collected = list(dance_values)
        yield year, str(collected)
def reduce(iter, params):
    """Yield the average bid price for each year.

    Args:
        iter: Iterable of ``(year, bid_price)`` pairs.
        params: Unused.

    Yields:
        ``(year, sum(prices) / len(prices))`` in sorted year order.
    """
    from disco.util import kvgroup

    for year, bid_prices in kvgroup(sorted(iter)):
        # The group is a one-shot iterable; materialize it so it can be
        # both summed and counted.
        prices = tuple(bid_prices)
        yield year, sum(prices) / len(prices)
def reduce(self, rows_iter, out, params):
    """
    Reduce: Sort all (line, 1) tuples and tally duplicate lines.

    Each distinct line is written once through ``out.add(line, total)``.
    """
    ordered_rows = sorted(rows_iter)
    for line, ones in kvgroup(ordered_rows):
        total = sum(ones)
        out.add(line, total)
    return None
def reduce(self, row_iter, out, params):
    """Pair every "EX" record with every "EG" record sharing its key.

    Args:
        row_iter: Iterable of ``(key, record)`` pairs; each record is a dict
            with ``"type"`` and ``"val"`` entries.
        out: Output sink; ``out.add(key, ex_val + eg_val)`` is called once per
            (EX, EG) combination within a key.
        params: Unused.
    """
    from disco.util import kvgroup

    # Sort on the key only: sorting whole pairs would compare the record
    # dicts on key ties, which raises TypeError on Python 3.
    for key, vals in kvgroup(sorted(row_iter, key=lambda kv: kv[0])):
        vals = list(vals)
        eg_records = [v for v in vals if v["type"] == "EG"]
        for v1 in vals:
            if v1["type"] != "EX":
                continue
            for v2 in eg_records:
                out.add(key, v1["val"] + v2["val"])
def reduce(iter, params):
    """Mask each sentence against the union of its word-count records.

    Args:
        iter: Iterable of ``(sentence, wordcounts)`` pairs.
        params: Unused.

    Yields:
        ``(words, 1)`` where ``words`` is a tuple with one entry per
        whitespace-separated word of the sentence: the word itself when it
        appears in the combined word counts, otherwise ``None``.
    """
    # TODO: Implement this as a map instead to maximize parallelism and locality
    # Sort on the sentence only so the word-count values are never compared
    # with each other on key ties.
    for sentence, wordcounts in kvgroup(sorted(iter, key=lambda kv: kv[0])):
        # Materialize the group: it is a one-shot iterable, and the assert
        # message below must still be able to show its contents.
        wordcounts = list(wordcounts)
        unionized_wordcounts = real_reduce(combine, wordcounts, {})
        assert unionized_wordcounts is not None, "Wordcounts were None in the reduce method: %s %s" % (
            combine, wordcounts)
        yield tuple(
            word if word in unionized_wordcounts else None
            for word in sentence.split()
        ), 1
def featurize_reduce(iter, params):
    """Generate features as reduce step in Disco's map-reduce.

    Generator. Implementation of reduce stage in map-reduce process, for
    model prediction feature generation of time series data. This function
    is never directly called, but rather passed as a parameter to the Disco
    `Job()` object's `run()` method.

    Parameters
    ----------
    iter : iterable
        Iterable of tuples each containing the file name of a time series
        data file to be used for featurization, and the associated class or
        type name.
    params : dict
        Dictionary of parameters for use in map-reduce process. The keys
        read here are 'tmp_dir_path', 'custom_script_path',
        'fname_class_dict_2' and 'features_to_use'.

    Yields
    ------
    tuple
        A two-element tuple containing the file name of the time series data
        set, and dict of the extracted features. ``("", "")`` is yielded
        instead when the file is missing or no class name was supplied.

    """
    from disco.util import kvgroup
    import ntpath
    from mltsp import featurize
    from mltsp import cfg

    for fname, class_name in kvgroup(sorted(iter)):
        if fname[:7] == "file://":
            fname = fname.replace("file://", "")
        class_names = list(class_name)
        if not class_names:
            # No class name at all: report the placeholder and move on.
            # Falling through would featurize with a generator object
            # stored as the class.
            yield "", ""
            continue
        # When several class names are present the first one is used.
        class_name = str(class_names[0])

        short_fname = os.path.splitext(ntpath.basename(fname))[0].split("$")[0]
        path_to_csv = os.path.join(params['tmp_dir_path'], fname)
        if os.path.exists(path_to_csv):
            print("Extracting features for " + fname)
            all_features = featurize.featurize_tsdata_object(
                path_to_csv, short_fname, params['custom_script_path'],
                params['fname_class_dict_2'], params['features_to_use'])
            all_features["class"] = class_name
            yield short_fname, all_features
        else:
            print("*" * 10 + " " + path_to_csv + " doesn't exist on the disk.")
            yield "", ""
def reduce(iter, params):
    """Attach a word's total count to every sentence recorded for it.

    Keys are ``(word, weight)`` pairs. A weight-0 group carries the counts
    for the word and a weight-1 group carries its sentences; sorting puts
    the weight-0 group first.

    Args:
        iter: Iterable of ``((word, weight), data)`` pairs.
        params: Unused.

    Yields:
        ``(sentence, {word: total})`` for each sentence whose word matches
        the most recently seen weight-0 group.
    """
    last_word = None
    for (word, weight), data in kvgroup(sorted(iter)):
        if weight == 0:
            # Only the first value of the group is read; its elements are
            # summed as integers.
            first_value = list(data)[0]
            wordcounts = {word: sum(int(count) for count in first_value)}
            last_word = word
            continue
        if weight == 1 and word == last_word:
            for sentence in list(data):
                yield sentence, wordcounts
def reduce(self, rows_iter, out, params):
    """
    Reduce: Sort all (word, 1) tuples then tally.

    Each distinct word is written once through ``out.add(word, total)``.
    """
    # Sorting first makes equal keys adjacent, which kvgroup needs in order
    # to combine their values into one group.
    grouped = kvgroup(sorted(rows_iter))
    for word, ones in grouped:
        out.add(word, sum(ones))
    return None
def reduce(iter, params):
    """Collect, per dataset name, every value reported for each metric.

    Args:
        iter: Iterable of ``(dataset_name, metrics)`` pairs where ``metrics``
            is a dict mapping metric name to value.
        params: Unused.

    Yields:
        ``(dataset_name, {metric_name: [values...]})``.
    """
    from disco.util import kvgroup

    # Sort on the key only: the values are dicts, and comparing them on key
    # ties raises TypeError on Python 3.
    for dn, metrics in kvgroup(sorted(iter, key=lambda kv: kv[0])):
        dataset = {}
        for metric in metrics:
            for name, val in metric.items():
                # setdefault creates the list AND lets the first value be
                # appended; previously the first value for each name was
                # silently dropped.
                dataset.setdefault(name, []).append(val)
        yield dn, dataset
def reduce(iter, params):
    """Yield the metadata for each DOI, fetching it when it is not stored.

    Args:
        iter: Iterable of ``(doi, value)`` pairs; the values are ignored.
        params: Unused.

    Yields:
        ``(doi, metadata)`` on success, or ``('error', message)`` when the
        fallback fetch fails.
    """
    for doi, nones in kvgroup(iter):
        try:
            yield doi, metadata.get(doi)
        except db.NotFound:
            try:
                yield doi, metadata.fetch(doi)
            # "except X as exc" is accepted by Python 2.6+ and Python 3;
            # the old "except X, exc" form is a syntax error on Python 3.
            except CommError as exc:
                yield 'error', str(exc)  # CommError has useless repr
            except Exception as exc:
                yield 'error', repr(exc)
def reduce_mse(interface, state, label, inp):
    """Emit the mean squared error over all grouped value pairs.

    For every key, the first two values of its group are converted to float
    and their squared difference is accumulated. A single ``("MSE", value)``
    pair is written to ``interface.output(0)``.

    Args:
        interface: Object providing ``output(0)``, whose result has ``add``.
        state: Unused.
        label: Unused.
        inp: Iterable of ``(key, value)`` pairs, grouped without sorting.
    """
    from disco.util import kvgroup  # groups values of consecutive equal keys

    out = interface.output(0)
    squared_error = 0
    sample_count = 0
    for _key, group in kvgroup(inp):
        pair = list(group)
        difference = float(pair[0]) - float(pair[1])
        squared_error += difference ** 2
        sample_count += 1
    out.add("MSE", squared_error / float(sample_count))
def reduce(row_iter, params):
    """Re-emit user records and attach user attributes to their edges.

    Args:
        row_iter: Iterable of ``(key, record)`` pairs; each record is a dict
            with a ``"type"`` entry of ``"User"`` or ``"Edge"``.
        params: Unused.

    Yields:
        ``(key, user_record)`` for every user record, and
        ``(edge["target"], edge)`` for every edge, once per user record
        under the same key. The edge's ``"source_attr"`` is set to that user
        record first, so the yielded edge dict is mutated in place.
    """
    from disco.util import kvgroup

    # Sort on the key only: comparing whole pairs would compare the record
    # dicts on key ties, which raises TypeError on Python 3.
    for key, vals in kvgroup(sorted(row_iter, key=lambda kv: kv[0])):
        vals = list(vals)
        # Find the users once instead of rescanning the group per record.
        users = [v for v in vals if v["type"] == "User"]
        for record in vals:
            if record["type"] == "User":
                yield key, record
            elif record["type"] == "Edge":
                for user in users:
                    record["source_attr"] = user
                    yield record["target"], record
def reduce(iter, params):
    """Sum the per-category query counts for each minute.

    Args:
        iter: Iterable of ``(nearest_minute, counts)`` pairs where ``counts``
            is a dict holding the keys 'nerd', 'sex', 'travel' and 'cooking'.
        params: Unused.

    Yields:
        ``(nearest_minute, totals)`` with ``totals`` mapping each of the four
        categories to its summed count.
    """
    from disco.util import kvgroup

    # Sort on the key only: the values are dicts, and comparing them on key
    # ties raises TypeError on Python 3.
    for nearest_minute, queries in kvgroup(sorted(iter, key=lambda kv: kv[0])):
        results = {
            'nerd': 0,
            'sex': 0,
            'travel': 0,
            'cooking': 0,
        }
        for result_dict in queries:
            for category in results:
                results[category] += result_dict[category]
        yield nearest_minute, results
def reduce_mse(interface, state, label, inp):
    """Write the mean squared error of the grouped pairs to output 0.

    Each key's group is expected to hold at least two values; the squared
    difference of the first two (as floats) is added to the running error.
    The result is emitted as one ``("MSE", error / number_of_groups)`` pair.

    Args:
        interface: Object providing ``output(0)``, whose result has ``add``.
        state: Unused.
        label: Unused.
        inp: Iterable of ``(key, value)`` pairs, grouped without sorting.
    """
    from disco.util import kvgroup  # function for grouping values by key

    sink = interface.output(0)  # every emitted pair uses the same output
    error_sum = 0
    groups_seen = 0
    for _, values in kvgroup(inp):
        values = list(values)
        left = float(values[0])
        right = float(values[1])
        error_sum += (left - right) ** 2
        groups_seen += 1
    sink.add("MSE", error_sum / float(groups_seen))
def reduce_ca(interface, state, label, inp):
    """Emit the fraction of groups whose first two values are equal.

    A single ``("CA", correct / total)`` pair is written to
    ``interface.output(0)``.

    Args:
        interface: Object providing ``output(0)``, whose result has ``add``.
        state: Unused.
        label: Unused.
        inp: Iterable of ``(key, value)`` pairs, grouped without sorting.
    """
    from disco.util import kvgroup  # groups values of consecutive equal keys

    out = interface.output(0)
    total = 0
    matches = 0
    for _key, group in kvgroup(inp):
        pair = list(group)
        total += 1
        if pair[0] == pair[1]:
            matches += 1
    out.add("CA", matches / float(total))
def receive_score(iter, params):
    """Combine the score messages sent to each node into a new score line.

    Values are ``(tag, payload)`` pairs. Payloads tagged ``"s"`` are summed;
    any other tag supplies the node's neighbor string.

    Args:
        iter: Iterable of ``(node_id, (tag, payload))`` pairs.
        params: Object exposing ``damping_factor``.

    Yields:
        ``(node_id, "<node_id> <score> <neighbors>")`` where
        ``score = 1 - d + d * sum_of_s_payloads``.
    """
    from disco.util import kvgroup

    d = params.damping_factor
    for node_id, messages in kvgroup(sorted(iter)):
        incoming = 0
        neighbors = None
        for tag, payload in messages:
            if tag != "s":
                neighbors = payload
                continue
            incoming += payload
        score = 1 - d + d * incoming
        line = str(node_id) + " " + str(score)
        yield node_id, line + " " + neighbors
def reduce(self, rows_iter, out, params):
    """Sum the rows recorded for each key element by element.

    Rows may differ in length; positions beyond the current totals are
    appended. The whole ``{key: totals}`` mapping is emitted once, via
    ``out.add(final, "a")``, after all input has been read.

    Args:
        rows_iter: Iterable of ``(key, row)`` pairs where ``row`` is an
            indexable sequence.
        out: Output sink providing ``add``.
        params: Unused.
    """
    from disco.util import kvgroup

    final = {}
    for key, rows in kvgroup(rows_iter):
        # Input is not sorted, so a key can show up in several groups; its
        # totals carry over between them.
        totals = final.setdefault(key, [])
        for row in rows:
            for position, item in enumerate(row):
                if position < len(totals):
                    totals[position] += item
                else:
                    totals.append(item)
    out.add(final, "a")
def reduce(iter, params):
    """Build a DiscoDB from the input and dump it to its partition's file.

    Args:
        iter: Iterable of ``(key, value)`` pairs; consecutive equal keys are
            grouped before being handed to ``DiscoDB``.
        params: Mapping providing ``'partitions'`` and ``'name'``.

    Yields:
        ``(partition, None)`` once, after the database has been written.
        Nothing is yielded or written when the database has no keys.
    """
    partitions = params['partitions']
    name = params['name']
    discodb = DiscoDB(kvgroup(iter))
    try:
        # figure out what partition we are in
        # (``iter`` is shadowed by the parameter, hence ``__iter__()``.)
        key = next(discodb.keys().__iter__())
    except StopIteration:
        # no keys, nothing to write
        return
    partition = util.default_partition(key, partitions, params)
    # Close the dump file deterministically, even if the dump fails.
    with open(filename(name, partition), 'w') as dump_file:
        discodb.dump(dump_file)
    yield partition, None
def reduce(iter, params):
    """Pair each sentence of a word with that word's total count.

    Keys are ``(word, weight)`` tuples: weight 0 marks the group holding the
    counts for the word, weight 1 the group holding its sentences. Sorting
    places the weight-0 group directly before the weight-1 group.

    Args:
        iter: Iterable of ``((word, weight), data)`` pairs.
        params: Unused.

    Yields:
        ``(sentence, {word: total})`` for sentences whose word equals the
        word of the latest weight-0 group.
    """
    last_word = None
    for key, data in kvgroup(sorted(iter)):
        word, weight = key
        if weight == 0:
            counts_group = list(data)
            # Only the first value is used; its items are summed as ints.
            total = sum([int(count) for count in counts_group[0]])
            wordcounts = {word: total}
            last_word = word
        elif weight == 1 and word == last_word:
            sentences = list(data)
            for sentence in sentences:
                yield sentence, wordcounts
def reduce(iter, params):
    """Yield, per key, the most frequent value if it occurs more than once.

    Args:
        iter: Iterable of ``(key, day)`` pairs; days must be hashable.
        params: Unused.

    Yields:
        ``(key, most_common_day)`` for keys whose most common day appears at
        least twice. On a tie the day that sorts first wins.
    """
    from collections import Counter

    from disco.util import kvgroup

    for key, counts in kvgroup(sorted(iter)):
        # Counter tallies in one pass instead of calling list.count() per
        # distinct value. It also breaks ties by first occurrence (smallest
        # value, as the input is sorted) rather than by set iteration order.
        day, occurrences = Counter(counts).most_common(1)[0]
        if occurrences > 1:
            yield key, day
def reduce_ca(interface, state, label, inp):
    """Write the share of matching value pairs to output 0.

    A group counts as correct when its first two values compare equal. One
    ``("CA", correct / number_of_groups)`` pair is emitted.

    Args:
        interface: Object providing ``output(0)``, whose result has ``add``.
        state: Unused.
        label: Unused.
        inp: Iterable of ``(key, value)`` pairs, grouped without sorting.
    """
    from disco.util import kvgroup  # function for grouping values by key

    sink = interface.output(0)  # every emitted pair uses the same output
    groups_seen, hits = 0, 0
    for _, values in kvgroup(inp):
        values = list(values)
        first, second = values[0], values[1]
        if first == second:
            hits += 1
        groups_seen += 1
    sink.add("CA", hits / float(groups_seen))
def reduce_fit(interface, state, label, inp):
    """Solve one least-squares problem per sample and emit its prediction.

    Keys are split on ``state["delimiter"]``. A key whose second field is
    ``"A"`` stores the sum of its values at row ``int(third field)`` of
    ``A``. Any other key is the right-hand side: its values are summed into
    ``b``, ``A x = b`` is solved with ``numpy.linalg.lstsq``, and the result
    is emitted. ``A`` is then reset for the next sample.

    Args:
        interface: Object providing ``output(0)``, whose result has ``add``.
        state: Mapping with ``"X_indices"``, ``"delimiter"`` and
            ``"samples"`` (sample id -> vector used for the prediction).
        label: Unused.
        inp: Iterable of ``(key, value)`` pairs, grouped without sorting.
            Values are presumably NumPy arrays -- confirm against the mapper.

    Emits:
        ``(sample_id, (dot(sample, thetas), thetas as list))``.
    """
    import numpy as np
    from disco.util import kvgroup

    out = interface.output(0)
    n_rows = len(state["X_indices"]) + 1
    A = [0] * n_rows
    for k, v in kvgroup(inp):
        ksplit = k.split(state["delimiter"])
        if ksplit[1] == "A":
            # ``v`` is a generator. np.sum() on a generator is deprecated
            # and merely delegates to the builtin, so call sum() directly.
            A[int(ksplit[2])] = sum(v)
        else:
            b = sum(v)
            thetas = np.linalg.lstsq(A, b)[0]
            out.add(ksplit[0], (np.dot(state["samples"][ksplit[0]], thetas), thetas.tolist()))
            A = [0] * n_rows
def reduce(row_iter, params):
    """Re-emit the user record and attach it to every edge under its key.

    Args:
        row_iter: Iterable of ``(key, record)`` pairs; each record is a dict
            with a ``"type"`` entry of ``"User"`` or ``"Edge"``.
        params: Unused.

    Yields:
        First ``(key, user)`` for the first user record of the key, then
        ``(edge["target"], edge)`` for every edge, with the edge's
        ``"source_attr"`` set to that user (the edge dict is mutated in
        place). Keys without a user record yield nothing.
    """
    from disco.util import kvgroup

    # Sort on the key only: comparing whole pairs would compare the record
    # dicts on key ties, which raises TypeError on Python 3.
    for key, vals in kvgroup(sorted(row_iter, key=lambda kv: kv[0])):
        vals = list(vals)
        user = None
        for candidate in vals:
            if candidate["type"] == "User":
                user = candidate
                yield key, candidate
                break
        if not user:
            continue
        for edge in vals:
            if edge["type"] == "Edge":
                edge["source_attr"] = user
                yield edge["target"], edge
def reduce_fit(interface, state, label, inp):
    """Fit least-squares coefficients per sample and emit the prediction.

    Each key is split on ``state["delimiter"]`` into
    ``[sample_id, kind, ...]``. When ``kind == "A"`` the summed values
    become row ``int(ksplit[2])`` of ``A``. Otherwise the summed values are
    the right-hand side ``b``: ``numpy.linalg.lstsq(A, b)`` is solved, the
    result is written to output 0, and ``A`` is cleared.

    Args:
        interface: Object providing ``output(0)``, whose result has ``add``.
        state: Mapping with ``"X_indices"``, ``"delimiter"`` and
            ``"samples"`` (sample id -> vector used for the prediction).
        label: Unused.
        inp: Iterable of ``(key, value)`` pairs, grouped without sorting.
            Values are presumably NumPy arrays -- confirm against the mapper.

    Emits:
        ``(sample_id, (dot(sample, thetas), thetas.tolist()))``.
    """
    import numpy as np
    from disco.util import kvgroup

    def empty_system():
        # One row per feature index plus one extra row.
        return [0 for _ in range(len(state["X_indices"]) + 1)]

    out = interface.output(0)
    A = empty_system()
    for k, v in kvgroup(inp):
        ksplit = k.split(state["delimiter"])
        if ksplit[1] == "A":
            # The group is a generator; np.sum() on generators is deprecated
            # and only forwards to the builtin sum(), so use that directly.
            A[int(ksplit[2])] = sum(v)
            continue
        b = sum(v)
        thetas = np.linalg.lstsq(A, b)[0]
        prediction = np.dot(state["samples"][ksplit[0]], thetas)
        out.add(ksplit[0], (prediction, thetas.tolist()))
        A = empty_system()
def reduce(iter, out, params):
    """Download each date's weather files and emit temperature statistics.

    For every date, the listed files are fetched from ``ftp.ncdc.noaa.gov``
    into ``/tmp/weather_files/<date>/``. Each file is read as gzip-compressed
    CSV through ``iopro`` and its ``TEMP`` column is collected.

    Args:
        iter: Iterable of ``(date, remote_file_path)`` pairs, grouped
            without sorting.
        out: Output sink; ``out.add(date, (mean, std))`` is called per date.
        params: Unused.
    """
    import ftplib
    import os

    import iopro
    import numpy as np
    from disco.util import kvgroup

    def connect():
        # login() without arguments logs in anonymously.
        connection = ftplib.FTP('ftp.ncdc.noaa.gov')
        connection.login()
        return connection

    for date, WeatherDateStat in kvgroup(iter):
        print(date)
        ftp = connect()
        avg_temp = []
        path = '/tmp/weather_files/' + str(date) + '/'
        if not os.path.exists(path):
            os.makedirs(path)
        try:
            for remote_file in WeatherDateStat:
                local_name = path + remote_file.split('/')[-1]
                with open(local_name, 'wb') as cache:
                    try:
                        ftp.retrbinary("RETR " + remote_file, cache.write, 8 * 1024)
                    except ftplib.all_errors:
                        # The connection probably dropped: reconnect once
                        # and retry. Discard whatever was written so far so
                        # the retry does not append to a partial download.
                        ftp.close()
                        ftp = connect()
                        cache.seek(0)
                        cache.truncate()
                        ftp.retrbinary("RETR " + remote_file, cache.write, 8 * 1024)
                adapter = iopro.text_adapter(local_name, compression='gzip', parser='csv',
                                             field_names=True)
                try:
                    avg_temp.extend(adapter[:]['TEMP'])
                finally:
                    adapter.close()
        finally:
            # Always release the connection, including on failure.
            ftp.close()
        mean_temp = np.mean(avg_temp)
        std_temp = np.std(avg_temp)
        print('Date Mean Std:  %s %s %s' % (date, mean_temp, std_temp))
        out.add(date, (mean_temp, std_temp))
def reduce(iter, params):
    """Merge the distance tables received for each node, keeping minima.

    Args:
        iter: Iterable of ``(node, record)`` pairs; each record is a dict
            with a ``"distances"`` mapping and optionally a ``"nodes"`` list.
        params: Unused.

    Yields:
        ``(node, json_text)`` where ``json_text`` encodes
        ``[node, merged_distances, nodes]``. ``nodes`` comes from the last
        record with a non-empty ``"nodes"`` entry, or ``[]`` if none has one.
    """
    from disco.util import kvgroup

    def mymin(a, b):
        # -1 is treated as "no distance": it is ignored unless both are -1.
        mins = [x for x in (a, b) if x != -1]
        if not mins:
            return -1
        return min(mins)

    def minFrom(d, a):
        # Fold the distances in ``a`` into ``d``, keeping the smaller value.
        for k, v in a.items():
            d[k] = mymin(d.get(k, -1), v)

    # Sort on the key only: the records are dicts, and comparing them on key
    # ties raises TypeError on Python 3.
    for node, distances in kvgroup(sorted(iter, key=lambda kv: kv[0])):
        nodes = []
        newdistances = {}
        for d in distances:
            if d.get("nodes"):
                nodes = d["nodes"]
            minFrom(newdistances, d["distances"])
        yield node, json.dumps([node, newdistances, nodes])
def reduce(iter, params):
    """Combine per-node distance records into one minimum-distance table.

    Args:
        iter: Iterable of ``(node, record)`` pairs; each record is a dict
            with a ``"distances"`` mapping and optionally a ``"nodes"`` list.
        params: Unused.

    Yields:
        ``(node, json.dumps([node, merged_distances, nodes]))``. For each
        target the smallest distance other than -1 is kept; -1 remains only
        when no record gives another value.
    """
    from disco.util import kvgroup

    def mymin(a, b):
        # -1 acts as a "missing" marker and never beats a real distance.
        candidates = [x for x in (a, b) if x != -1]
        return min(candidates) if candidates else -1

    def minFrom(d, a):
        # Update ``d`` in place with the element-wise minimum of ``d``, ``a``.
        for k, v in a.items():
            d[k] = mymin(d.get(k, -1), v)

    # Group on the key alone; sorting the full pairs would compare the
    # record dicts whenever keys tie (TypeError on Python 3).
    by_node = sorted(iter, key=lambda kv: kv[0])
    for node, distances in kvgroup(by_node):
        nodes = []
        newdistances = {}
        for d in distances:
            if d.get("nodes"):
                nodes = d["nodes"]
            minFrom(newdistances, d["distances"])
        yield node, json.dumps([node, newdistances, nodes])
def reduce(pos_iter, out, params):
    """Add up the counts for each key and write the totals to ``out``.

    Args:
        pos_iter: Iterable of ``(pos, count)`` pairs.
        out: Output sink; ``out.add(pos, total)`` is called once per key, in
            sorted order.
        params: Unused.
    """
    from disco.util import kvgroup

    ordered = sorted(pos_iter)
    for pos, counts in kvgroup(ordered):
        total = sum(counts)
        out.add(pos, total)
def sum_vs(iter, *args):
    """Yield ``(key, sum of values)`` for every key, in sorted key order.

    Args:
        iter: Iterable of ``(key, value)`` pairs.
        *args: Ignored.
    """
    from disco.util import kvgroup

    grouped = kvgroup(sorted(iter))
    for key, values in grouped:
        total = sum(values)
        yield key, total
def count_ks(iter, *args):
    """Yield one pair ``(number of distinct keys, None)``.

    Args:
        iter: Iterable of ``(key, value)`` pairs.
        *args: Ignored.
    """
    from disco.util import kvgroup

    distinct_keys = 0
    # After sorting, every group corresponds to exactly one distinct key.
    for _group in kvgroup(sorted(iter)):
        distinct_keys += 1
    yield distinct_keys, None
def count_vs(iter, *args):
    """Yield ``(key, number of values)`` for every key, in sorted order.

    Args:
        iter: Iterable of ``(key, value)`` pairs.
        *args: Ignored.
    """
    from disco.util import kvgroup

    for key, values in kvgroup(sorted(iter)):
        occurrences = 0
        for _value in values:
            occurrences += 1
        yield key, occurrences
def Reduce(interface, state, label, inp):
    """Emit each base64-decoded key with the number of values it received.

    Args:
        interface: Object providing ``output(0)``, whose result has ``add``.
        state: Unused.
        label: Unused.
        inp: Iterable of ``(base64_key, value)`` pairs, grouped without
            sorting.
    """
    out = interface.output(0)
    for k, vs in kvgroup(inp):
        # base64.decodestring() was removed in Python 3.9; b64decode()
        # decodes the same input and exists on Python 2 and 3.
        out.add(base64.b64decode(k), len(list(vs)))
def Reduce(interface, state, label, inp):
    """Emit every distinct consecutive key once, as bytes, with value 0.

    Args:
        interface: Object providing ``output(0)``, whose result has ``add``.
        state: Unused.
        label: Unused.
        inp: Iterable of ``(key, value)`` pairs; the values are ignored.
    """
    sink = interface.output(0)
    for key, _values in kvgroup(inp):
        encoded_key = str_to_bytes(key)
        sink.add(encoded_key, 0)
def reduce(iter, params):
    """Yield ``(word, total)`` per word, with the total rendered as a string.

    Args:
        iter: Iterable of ``(word, count)`` pairs.
        params: Unused.
    """
    grouped = kvgroup(sorted(iter))
    for word, counts in grouped:
        total = sum(counts)
        yield word, str(total)
def featurize_reduce(iter, params):
    '''Generator, implementation of reduce stage in map-reduce process, for
    feature generation of time series data.

    iter is an iterable of tuples containing the file name of a time series
    data file to be used for featurization, and the associated class or type
    name.

    params is a mapping; the keys read here are 'features_to_use',
    'custom_script_path' and 'meta_features'.

    Yields a two-element tuple containing the file name of the time series
    data set, and dict of the extracted features. ("", "") is yielded when no
    class name is available or the data file cannot be found.
    '''
    import os
    import sys

    from disco.util import kvgroup

    for fname, class_name in kvgroup(sorted(iter)):
        class_names = list(class_name)
        if len(class_names) == 1:
            class_name = str(class_names[0])
        elif len(class_names) == 0:
            print("CLASS_NAMES: " + str(class_names) + "\n" +
                  "CLASS_NAME: " + str(class_name))
            yield "", ""
            # Nothing to featurize without a class name. Falling through
            # would try to concatenate a generator into the message below.
            continue
        else:
            print("CLASS_NAMES: " + str(class_names) + "\n" +
                  "CLASS_NAME: " + str(class_name) +
                  " - Choosing first class name in list.")
            class_name = str(class_names[0])

        print("fname: " + fname + ", class_name: " + class_name)

        # Project modules live outside the default import path. Extend
        # sys.path only when needed so it does not grow with every record.
        PATH_TO_PROJECT_DIRECTORY = os.path.join(os.path.expanduser("~"),
                                                 "Dropbox/work_etc/mlweb")
        if PATH_TO_PROJECT_DIRECTORY not in sys.path:
            sys.path.append(PATH_TO_PROJECT_DIRECTORY)
        import cfg
        if cfg.TCP_INGEST_TOOLS_PATH not in sys.path:
            sys.path.append(cfg.TCP_INGEST_TOOLS_PATH)
        import generate_science_features
        import build_rf_model
        import lc_tools
        import custom_feature_tools as cft

        # Base name of the file with its ".<extension>" text removed.
        base_name = fname.split("/")[-1]
        extension = ("." + fname.split(".")[-1]) if "." in base_name else ""
        short_fname = base_name.replace(extension, "")
        path_to_csv = os.path.join(cfg.UPLOAD_FOLDER,
                                   os.path.join("unzipped", fname))
        all_features = {}
        print("path_to_csv: " + path_to_csv)
        if os.path.isfile(path_to_csv):
            print("Extracting features for " + fname)

            ## generate features:
            requested = set(params['features_to_use'])
            if requested & set(cfg.features_list):
                timeseries_features = lc_tools.generate_timeseries_features(
                    path_to_csv, classname=class_name, sep=',')
            else:
                timeseries_features = {}
            if requested & set(cfg.features_list_science):
                science_features = generate_science_features.generate(
                    path_to_csv=path_to_csv)
            else:
                science_features = {}
            if params['custom_script_path']:
                # Later sources override earlier ones, as when the item
                # lists were concatenated before building the dict.
                features_already_known = {}
                features_already_known.update(timeseries_features)
                features_already_known.update(science_features)
                if fname in params['meta_features']:
                    features_already_known.update(
                        params['meta_features'][fname])
                custom_features = cft.generate_custom_features(
                    custom_script_path=params['custom_script_path'],
                    path_to_csv=path_to_csv,
                    features_already_known=features_already_known)
            else:
                custom_features = {}

            all_features = {}
            all_features.update(timeseries_features)
            all_features.update(science_features)
            all_features.update(custom_features)
            all_features["class"] = class_name
        else:
            print(fname + " is not a file.")
            yield "", ""
        # NOTE(review): for a missing file this still yields
        # (short_fname, {}) after the ("", "") placeholder -- kept as is,
        # confirm whether downstream code relies on it.
        yield short_fname, all_features
def test_map(self):
    """Run an ``OnlyMapJob`` and check that every input key sums to 10.

    The inputs are the integers ``0 .. 10 * num_workers - 1``. The job's
    results are sorted, grouped by key and summed, then compared with
    ``(i, 10)`` for each input ``i``.
    """
    inputs = range(10 * self.num_workers)
    self.job = OnlyMapJob().run(input=self.test_server.urls(inputs))
    grouped = kvgroup(sorted(self.results(self.job)))
    totals = ((key, sum(values)) for key, values in grouped)
    expected = ((i, 10) for i in inputs)
    self.assertAllEqual(totals, expected)
def reduce(iter, params):
    """Yield the average ratio recorded for each extension.

    Args:
        iter: Iterable of ``(extension, ratio)`` pairs.
        params: Unused.

    Yields:
        ``(extension, sum(ratios) / len(ratios))`` in sorted order.
    """
    from disco.util import kvgroup

    for extension, ratios in kvgroup(sorted(iter)):
        collected = list(ratios)
        average = sum(collected) / len(collected)
        yield extension, average
def reduce(iter, params):
    """Yield each base64-decoded key with the number of values it received.

    Args:
        iter: Iterable of ``(base64_key, value)`` pairs, grouped without
            sorting.
        params: Unused.

    Yields:
        ``(decoded_key, value_count)``.
    """
    for k, vs in kvgroup(iter):
        # base64.decodestring() was removed in Python 3.9; b64decode()
        # decodes the same input and exists on Python 2 and 3.
        yield base64.b64decode(k), len(list(vs))
def reduce(iter, params):
    """Yield ``(key, total)`` where the values are summed as integers.

    Args:
        iter: Iterable of ``(key, value)`` pairs; each value must be
            convertible with ``int()``.
        params: Unused.
    """
    for key, values in kvgroup(sorted(iter)):
        total = 0
        for value in values:
            total += int(value)
        yield key, total
def pred_featurize_reduce(iter, params):
    '''Generator, implementation of reduce stage in map-reduce process, for
    model prediction feature generation of time series data.

    iter is an iterable of tuples containing the file name of a time series
    data file to be used for featurization, and an unused placeholder string.

    params is a mapping; the keys read here are 'featset_key', 'sep',
    'custom_features_script' and 'meta_features'.

    Yields a two-element tuple containing the file name of the time series
    data set, and a two-element list containing the extracted features and
    the original time series data. Files that cannot be found are reported
    and skipped.
    '''
    from copy import deepcopy
    import os
    import sys

    from disco.util import kvgroup

    featset_key = params['featset_key']
    sep = params['sep']
    custom_features_script = params['custom_features_script']
    meta_features = params['meta_features']

    PATH_TO_PROJECT_DIRECTORY = os.path.join(os.path.expanduser("~"),
                                             "Dropbox/work_etc/mlweb")
    sys.path.append(PATH_TO_PROJECT_DIRECTORY)
    import cfg
    sys.path.append(cfg.TCP_INGEST_TOOLS_PATH)
    import generate_science_features
    import predict_class as predict
    import build_rf_model
    import lc_tools
    import custom_feature_tools as cft

    for fname, junk in kvgroup(sorted(iter)):
        upload_path = os.path.join(cfg.UPLOAD_FOLDER, fname)
        if os.path.isfile(fname):
            data_path = fname
        elif os.path.isfile(upload_path):
            data_path = upload_path
        else:
            # Build the whole message before printing it. The old statement
            # form "print (...) + '...'" is a TypeError on Python 3.
            print((fname if cfg.UPLOAD_FOLDER in fname else upload_path) +
                  " is not a file...")
            if os.path.exists(upload_path) or os.path.exists(fname):
                print("But it does exist on the disk.")
            else:
                print("and in fact it doesn't even exist.")
            continue

        with open(data_path) as f:
            lines = f.readlines()

        ts_data = []
        for line in lines:
            stripped = line.strip("\n").strip()
            fields = stripped.split(sep)
            # Fall back to another common separator whenever it splits the
            # line into more fields than the current choice does.
            for alternative in (",", " ", "\t"):
                candidate = stripped.split(alternative)
                if len(fields) < len(candidate):
                    fields = candidate
            ts_data.append([float(field) for field in fields])
        del lines

        features_path = os.path.join(cfg.FEATURES_FOLDER,
                                     "%s_features.csv" % featset_key)
        with open(features_path) as f:
            features_in_model = f.readline().strip().split(',')
        features_to_use = features_in_model

        ## generate features:
        if set(features_to_use) & set(cfg.features_list):
            timeseries_features = lc_tools.generate_timeseries_features(
                deepcopy(ts_data), sep=sep, ts_data_passed_directly=True)
        else:
            timeseries_features = {}
        if set(features_to_use) & set(cfg.features_list_science):
            science_features = generate_science_features.generate(
                ts_data=deepcopy(ts_data))
        else:
            science_features = {}
        if custom_features_script:
            # Later sources override earlier ones, as when the item lists
            # were concatenated before building the dict.
            features_already_known = {}
            for known in (timeseries_features, science_features,
                          meta_features):
                features_already_known.update(known)
            custom_features = cft.generate_custom_features(
                custom_script_path=custom_features_script,
                path_to_csv=None,
                features_already_known=features_already_known,
                ts_data=deepcopy(ts_data))
        else:
            custom_features = {}

        all_features = {}
        for part in (timeseries_features, science_features, custom_features,
                     meta_features):
            all_features.update(part)
        yield fname, [all_features, ts_data]
def reduce(iter, params):
    """Yield ``(char, total count)`` for every character, in sorted order.

    Args:
        iter: Iterable of ``(char, count)`` pairs.
        params: Unused.
    """
    from disco.util import kvgroup

    ordered = sorted(iter)
    for char, occurrences in kvgroup(ordered):
        total = sum(occurrences)
        yield char, total
def fun_reduce(iter, params):
    """Yield ``(key, sorted list of the key's values)`` in sorted key order.

    Args:
        iter: Iterable of ``(key, value)`` pairs.
        params: Unused.
    """
    # The ``for`` header was missing its trailing colon, which made the
    # whole definition a syntax error.
    for k, v in kvgroup(sorted(iter)):
        yield k, sorted(v)
def reduce(iter, params):
    """Yield ``(age, total count)`` for every age, in sorted order.

    Args:
        iter: Iterable of ``(age, count)`` pairs.
        params: Unused.
    """
    from disco.util import kvgroup

    for age, counts in kvgroup(sorted(iter)):
        running_total = 0
        for count in counts:
            running_total = running_total + count
        yield age, running_total
def reduce(dt_iter, out, params):
    """Write the summed count of every word to ``out``.

    Args:
        dt_iter: Iterable of ``(word, count)`` pairs.
        out: Output sink; ``out.add(word, total)`` is called once per word,
            in sorted order.
        params: Unused.
    """
    from disco.util import kvgroup

    grouped = kvgroup(sorted(dt_iter))
    for word, counts in grouped:
        out.add(word, sum(counts))
def reduce(iter, params):
    """Yield the average bid price per year, in sorted year order.

    Args:
        iter: Iterable of ``(year, bid_price)`` pairs.
        params: Unused.

    Yields:
        ``(year, total_of_prices / number_of_prices)``.
    """
    from disco.util import kvgroup

    for year, bid_prices in kvgroup(sorted(iter)):
        total = 0
        how_many = 0
        for price in bid_prices:
            total += price
            how_many += 1
        yield year, total / how_many