Python kvgroupの例、disco.util.kvgroup Pythonの例

コード例 #1

0

ファイルを表示

ファイル: map_reduce.py プロジェクト: rupenp/sentiment-analyzer

def reducer(klv, params):
    """Reduce step that sums per-document counts for every known token.

    The incoming pairs are sorted and grouped by token. Tokens that are not
    in the trained vectorizer's vocabulary are skipped; for the others the
    counts are accumulated per document id. Only the counts are returned
    here; the actual matrix is built from the reducer results afterwards.

    Args:
        klv: The key and the list of values given as input to the reducer.
        params: The parameters supplied while initializing the MapReduce;
            must expose ``trained_vectorizer``.

    Yields:
        ``(vocabulary_index, {doc_id: summed_count})`` for each known token.
    """
    from disco.util import kvgroup

    vectorizer = params.trained_vectorizer

    for raw_token, pairs in kvgroup(sorted(klv)):
        # Look the token up with all of its whitespace removed.
        lookup_key = ''.join(raw_token.strip().split())
        column = vectorizer.vocabulary_.get(lookup_key)
        if column is None:
            continue
        per_doc = {}
        for doc_id, count in pairs:
            if doc_id in per_doc:
                per_doc[doc_id] += count
            else:
                per_doc[doc_id] = count
        yield column, per_doc

コード例 #2

0

ファイルを表示

ファイル: func.py プロジェクト: discoproject/discodex

def mean_vs(iter, *args):
    """Yield ``(key, mean)`` for every key in the sorted, grouped input.

    Each value is converted with ``float`` before being summed. The divisor
    is the number of values in the group (the previous version divided by
    the last zero-based index, i.e. count - 1, which skewed the mean and
    raised ZeroDivisionError for single-value groups).

    Args:
        iter: Iterable of ``(key, value)`` pairs.
        *args: Ignored.
    """
    from disco.util import kvgroup
    for k, vs in kvgroup(sorted(iter)):
        total = 0.
        count = 0
        for v in vs:
            total += float(v)
            count += 1
        # presumably kvgroup never yields an empty group, so count >= 1
        yield k, total / count

コード例 #3

0

ファイルを表示

ファイル: process_add_userinfo_3.py プロジェクト: Tskatom/Flu-Outbreak-Predicting

    def reduce(row_iter, params):
        """Join each "Edge" record of a key with that key's "User" record.

        Rows are sorted and grouped by key. If a group contains several
        "User" records the last one is used. Groups without a user record
        produce no output.

        Yields:
            ``((source age, gender, income, target age, gender, income,
            location, duration, day, exp_id), "")`` for every edge record.
        """
        from disco.util import kvgroup

        for key, vals in kvgroup(sorted(row_iter)):
            records = list(vals)
            user = None
            for record in records:
                # No early exit: a later "User" record replaces an earlier one.
                if record["type"] == "User":
                    user = record
            if not user:
                continue
            for edge in records:
                if edge["type"] != "Edge":
                    continue
                source = edge["source_attr"]
                source_side = (source["age"], source["gender"], source["income"])
                target_side = (user["age"], user["gender"], user["income"])
                exp_id, day = edge["exp_id"], edge["day"]
                location, duration = edge["location"], edge["duration"]
                yield source_side + target_side + (
                    location, duration, day, exp_id), ""

コード例 #4

0

ファイルを表示

ファイル: process_add_userinfo.py プロジェクト: Tskatom/Flu-Outbreak-Predicting

    def reduce(row_iter, params):
        """Join each "Edge" record of a key with the key's first "User" record.

        Rows are sorted and grouped by key. Groups without a user record
        produce no output.

        Yields:
            ``((source age, gender, income, target age, gender, income,
            location, duration, day, exp_id), "")`` for every edge record.
        """
        from disco.util import kvgroup

        for key, vals in kvgroup(sorted(row_iter)):
            records = list(vals)
            # First record tagged "User", or None when the group has none.
            user = next((r for r in records if r["type"] == "User"), None)
            if not user:
                continue
            for edge in records:
                if edge["type"] != "Edge":
                    continue
                joined = (
                    edge["source_attr"]["age"],
                    edge["source_attr"]["gender"],
                    edge["source_attr"]["income"],
                    user["age"],
                    user["gender"],
                    user["income"],
                    edge["location"],
                    edge["duration"],
                    edge["day"],
                    edge["exp_id"],
                )
                yield joined, ""

コード例 #5

0

ファイルを表示

ファイル: simple_innerjoin.py プロジェクト: chinnurtb/disco_playground

 def reduce(self, rows_iter, out, params):
     """Emit every key whose merged descriptor list has more than one entry.

     The descriptor sequences of each key are flattened into a single list.
     Keys with zero or one merged descriptor are dropped.
     """
     from disco.util import kvgroup
     from itertools import chain
     # The rows are not re-sorted here.
     # NOTE(review): presumably they arrive already ordered by key - confirm.
     for url_key, descriptors in kvgroup(rows_iter):
         merged = list(chain(*descriptors))
         if len(merged) <= 1:
             continue
         out.add(url_key, merged)

コード例 #6

0

ファイルを表示

ファイル: danceability_by_year.py プロジェクト: joelcrocker/hackreduce

def reduce(iter, params):
    """Yield ``(year, str(values))`` for every year in the sorted input.

    The values of each year are collected into a list and emitted as that
    list's string representation. The unused ``length`` local and the
    commented-out averaging experiments were removed; output is unchanged.

    Args:
        iter: Iterable of ``(year, value)`` pairs.
        params: Unused.
    """
    from disco.util import kvgroup
    for year, dance_values in kvgroup(sorted(iter)):
        yield year, str(list(dance_values))

コード例 #7

0

ファイルを表示

ファイル: treasury_yield.py プロジェクト: dcrosta/mongo-disco

def reduce(iter, params):
    """Yield ``(year, average bid price)`` for every year in the input.

    The grouped prices are materialized once because both their sum and
    their count are needed. The unused ``ilen`` import and the commented-out
    alternatives were removed.

    Args:
        iter: Iterable of ``(year, bid_price)`` pairs.
        params: Unused.
    """
    from disco.util import kvgroup
    for year, bid_prices in kvgroup(sorted(iter)):
        prices = list(bid_prices)
        yield year, sum(prices) / len(prices)

コード例 #8

0

ファイルを表示

ファイル: sort_mapr.py プロジェクト: stharrold/ARCHIVED_bench_mapr

    def reduce(self, rows_iter, out, params):
        """
        Reduce:
        Sort the (line, 1) tuples, group them by line and add each line to
        the output together with the sum of its counts.
        """
        ordered_rows = sorted(rows_iter)
        for line, counts in kvgroup(ordered_rows):
            total = sum(counts)
            out.add(line, total)

コード例 #9

0

ファイルを表示

    def reduce(self, rows_iter, out, params):
        """
        Reduce:
        Tally duplicate lines. The (line, 1) tuples are sorted so that equal
        lines are adjacent, then the counts of each line are summed.
        """
        grouped = kvgroup(sorted(rows_iter))
        for line, ones in grouped:
            out.add(line, sum(ones))

コード例 #10

0

ファイルを表示

ファイル: merge_exp_edge_mr.py プロジェクト: Tskatom/Flu-Outbreak-Predicting

 def reduce(self, row_iter, out, params):
     """Combine every "EX" record of a key with every "EG" record of it.

     Rows are sorted and grouped by key. For each (EX, EG) pair inside a
     group, ``ex["val"] + eg["val"]`` is added to the output under the key.
     Output order is unchanged: EX records in input order, and for each of
     them the EG records in input order.

     The records are split by type once instead of re-testing both types
     for every pair, and the unused ``chain`` import was dropped.
     """
     from disco.util import kvgroup
     for key, vals in kvgroup(sorted(row_iter)):
         vals = list(vals)
         ex_records = [v for v in vals if v["type"] == "EX"]
         if not ex_records:
             continue
         eg_records = [v for v in vals if v["type"] == "EG"]
         for ex in ex_records:
             for eg in eg_records:
                 out.add(key, ex["val"] + eg["val"])

コード例 #11

0

ファイルを表示

 def reduce(iter, params):
     """Yield a word-pattern tuple for every sentence in the input.

     The word counts grouped under a sentence are folded with ``combine``.
     Each word of the sentence that is a key of the folded result is kept;
     every other word is replaced by ``None``. The pattern is yielded with
     the value 1.

     Yields:
         ``(tuple_of_words_or_None, 1)``.
     """
     # TODO: Implement this as a map instead to maximize parallelism and locality
     for sentence, wordcounts in kvgroup(sorted(iter)):
         # Materialize first so the assertion message can still show the
         # inputs; the group iterator is exhausted after the fold.
         wordcounts = list(wordcounts)
         unionized_wordcounts = real_reduce(combine, wordcounts, {})
         assert unionized_wordcounts is not None, "Wordcounts were None in the reduce method: %s %s" % (
             combine, wordcounts)
         # `in` replaces dict.has_key, which does not exist on Python 3.
         yield tuple([
             word if word in unionized_wordcounts else None
             for word in sentence.split()
         ]), 1

コード例 #12

0

ファイルを表示

ファイル: parallel_processing.py プロジェクト: gitter-badger/mltsp

def featurize_reduce(iter, params):
    """Generate features as reduce step in Disco's map-reduce.

    Generator. Implementation of reduce stage in map-reduce process,
    for model prediction feature generation of time series data.

    This function is never directly called, but rather passed as a
    parameter to the Disco `Job()` object's `run()` method.

    Parameters
    ----------
    iter : iterable
        Iterable of tuples each containing the file name of a time
        series data file to be used for featurization, and the
        associated class or type name.
    params : dict
        Dictionary of parameters for use in map-reduce process. The keys
        read here are 'tmp_dir_path', 'custom_script_path',
        'fname_class_dict_2' and 'features_to_use'.

    Yields
    ------
    tuple
        A two-element tuple containing the file name of the time
        series data set, and dict of the extracted features. The
        placeholder ``("", "")`` is yielded when a file has no class
        name or does not exist on disk.

    """
    # `os` is imported locally like the other dependencies, so the function
    # does not rely on a module-level import being present.
    import os
    from disco.util import kvgroup
    import ntpath
    from mltsp import featurize
    from mltsp import cfg

    for fname, class_name in kvgroup(sorted(iter)):
        if fname.startswith("file://"):
            fname = fname.replace("file://", "")
        class_names = list(class_name)
        if not class_names:
            # Emit the placeholder and move on. Previously execution fell
            # through and used the exhausted iterator as the class name.
            yield "", ""
            continue
        # When several class names are present, the first one wins.
        class_name = str(class_names[0])

        short_fname = os.path.splitext(ntpath.basename(fname))[0].split("$")[0]
        path_to_csv = os.path.join(params['tmp_dir_path'], fname)
        if os.path.exists(path_to_csv):
            print("Extracting features for " + fname)
            all_features = featurize.featurize_tsdata_object(
                path_to_csv, short_fname, params['custom_script_path'],
                params['fname_class_dict_2'], params['features_to_use'])
            all_features["class"] = class_name
            yield short_fname, all_features
        else:
            print("*" * 10 + " " + path_to_csv + " doesn't exist on the disk.")
            yield "", ""

コード例 #13

0

ファイルを表示

ファイル: dslct_jobs.py プロジェクト: JensRantil/disco-slct

        def reduce(iter, params):
            """Attach the total count of a word to each of its sentences.

            The input is keyed by ``(word, weight)`` and sorted, so the
            weight-0 group of a word precedes its weight-1 group. The
            weight-0 group supplies the counts; the weight-1 group supplies
            the sentences, which are only emitted when a count for the same
            word was just seen.

            The body is indented with spaces only; the previous mix of
            spaces and tabs is rejected by Python 3.

            Yields:
                ``(sentence, {word: total_count})``.
            """
            last_word = None
            for (word, weight), data in kvgroup(sorted(iter)):
                if weight == 0:
                    wordcount = list(data)
                    # Only the first value of the group is summed.
                    wordcounts = {word: sum([int(count) for count in wordcount[0]])}
                    last_word = word
                elif weight == 1 and word == last_word:
                    for sentence in list(data):
                        yield sentence, wordcounts

コード例 #14

0

ファイルを表示

ファイル: count_words_mapr.py プロジェクト: corersky/ARCHIVED_bench_mapr

    def reduce(self, rows_iter, out, params):
        """
        Reduce:
        Sort the (word, 1) tuples, then add every word to the output with
        the sum of its counts.
        """

        # Sorting first makes equal words adjacent; kvgroup only combines
        # the values of consecutive keys that compare as equal.
        ordered_rows = sorted(rows_iter)
        for word, counts in kvgroup(ordered_rows):
            tally = sum(counts)
            out.add(word, tally)

コード例 #15

0

ファイルを表示

ファイル: rrdb_map.py プロジェクト: usingsystem/errdb-map-reduce

def reduce(iter, params):
    """Collect, per key, the values of every metric name into lists.

    Each grouped item is a mapping of metric name to value. All values of a
    name are appended to that name's list, including the first one, which
    the previous version discarded when it created the list.

    ``items()`` and ``setdefault`` replace the Python-2-only ``iteritems()``
    and ``has_key()``.

    Yields:
        ``(dn, {metric_name: [values...]})``.
    """
    from disco.util import kvgroup
    for dn, metrics in kvgroup(sorted(iter)):
        dataset = {}
        for metric in metrics:
            for name, val in metric.items():
                dataset.setdefault(name, []).append(val)
        yield dn, dataset

コード例 #16

0

ファイルを表示

ファイル: jobs.py プロジェクト: pombredanne/springer-analytics

 def reduce(iter, params):
     """Yield the metadata of every DOI key, fetching it when not stored.

     ``metadata.get`` is tried first. On ``db.NotFound`` the metadata is
     fetched instead. If the fetch fails, an ``('error', message)`` pair is
     yielded in place of the result.

     The input is grouped without being sorted first.
     NOTE(review): presumably it arrives ordered by key - confirm.

     ``except ... as exc`` replaces the Python-2-only comma form.
     """
     for doi, _ in kvgroup(iter):
         try:
             yield doi, metadata.get(doi)
         except db.NotFound:
             try:
                 yield doi, metadata.fetch(doi)
             except CommError as exc:
                 yield 'error', str(exc) # CommError has useless repr
             except Exception as exc:
                 yield 'error', repr(exc)

コード例 #17

0

ファイルを表示

ファイル: accuracy.py プロジェクト: romanorac/discomll

def reduce_mse(interface, state, label, inp):
    """Add the mean squared error of the grouped value pairs to the output.

    For every key the first two grouped values are converted to float and
    their squared difference is accumulated. The result is written once,
    under the key "MSE".
    NOTE(review): presumably the two values are prediction and true value.
    """
    from disco.util import kvgroup  # groups values by key
    out = interface.output(0)  # every emitted pair uses the same output label

    squared_error_sum = 0
    sample_count = 0

    # The input pairs are sorted and grouped by key.
    for _, grouped in kvgroup(inp):
        pair = list(grouped)
        difference = float(pair[0]) - float(pair[1])
        squared_error_sum += difference ** 2
        sample_count += 1

    out.add("MSE", squared_error_sum / float(sample_count))

コード例 #18

0

ファイルを表示

ファイル: process_add_userinfo_2.py プロジェクト: Tskatom/Flu-Outbreak-Predicting

 def reduce(row_iter, params):
     """Emit user records and edge records enriched with user data.

     Rows are sorted and grouped by key. Each "User" record is yielded
     under the group key. Each "Edge" record is yielded under its own
     "target" once per "User" record in the group, with that user stored
     in the edge's "source_attr".
     """
     from disco.util import kvgroup

     for key, vals in kvgroup(sorted(row_iter)):
         records = list(vals)
         for record in records:
             kind = record["type"]
             if kind == "User":
                 yield key, record
             elif kind == "Edge":
                 for other in records:
                     if other["type"] == "User":
                         # The edge dict is modified in place before it
                         # is emitted.
                         record["source_attr"] = other
                         yield record["target"], record

コード例 #19

0

ファイルを表示

ファイル: reducer.py プロジェクト: ashchristopher/HackReduceToronto

def reduce(iter, params):
    """Sum the per-category counters of every key.

    Each grouped item is a mapping holding a number for each of the four
    categories. The numbers are added up per category.

    Yields:
        ``(nearest_minute, {category: total})``.
    """
    from disco.util import kvgroup
    categories = ('nerd', 'sex', 'travel', 'cooking')
    for nearest_minute, queries in kvgroup(sorted(iter)):
        results = dict.fromkeys(categories, 0)
        for result_dict in queries:
            for category in categories:
                results[category] += result_dict[category]
        yield nearest_minute, results

コード例 #20

0

ファイルを表示

ファイル: accuracy.py プロジェクト: sb123456789sb/discomll

def reduce_mse(interface, state, label, inp):
    """Write the mean squared error over all keys under the key "MSE".

    The squared difference of the first two values of every key is
    averaged over the number of keys.
    """
    from disco.util import kvgroup  # groups values by key
    out = interface.output(0)  # a single output label is used throughout

    total_squared_error = 0
    number_of_samples = 0

    grouped_input = kvgroup(inp)  # input pairs are sorted and grouped by key
    for _key, values in grouped_input:
        values = list(values)
        first = float(values[0])
        second = float(values[1])
        total_squared_error += (first - second)**2
        number_of_samples += 1

    out.add("MSE", total_squared_error / float(number_of_samples))

コード例 #21

0

ファイルを表示

ファイル: accuracy.py プロジェクト: romanorac/discomll

def reduce_ca(interface, state, label, inp):
    """Add the share of keys whose first two values are equal as "CA".

    Every key counts as one sample. A sample is counted as correct when
    the first two values grouped under it compare equal.
    """
    from disco.util import kvgroup  # groups values by key
    out = interface.output(0)  # every emitted pair uses the same output label

    sample_count = 0
    hits = 0

    # The input pairs are sorted and grouped by key.
    for _, grouped in kvgroup(inp):
        pair = list(grouped)
        hits += 1 if pair[0] == pair[1] else 0
        sample_count += 1

    out.add("CA", hits / float(sample_count))

コード例 #22

0

ファイルを表示

ファイル: page_rank.py プロジェクト: nagyistge/discoproject.org-disco

def receive_score(iter, params):
    """Compute the damped score of every node and re-emit its neighbors.

    The values of a node are ``(tag, payload)`` pairs. Payloads tagged
    "s" are summed; any other payload is kept as the neighbor string, and
    the last one wins. The score is ``1 - d + d * sum`` where ``d`` is
    ``params.damping_factor``.

    Yields:
        ``(node_id, "<node_id> <score> <neighbors>")``.
    """
    from disco.util import kvgroup
    d = params.damping_factor
    for node_id, vals in kvgroup(sorted(iter)):
        incoming = 0
        neighbors = None
        for tag, payload in vals:
            if tag != "s":
                neighbors = payload
                continue
            incoming += payload
        score = 1 - d + d * incoming
        yield node_id, " ".join([str(node_id), str(score), neighbors])

コード例 #23

0

ファイルを表示

ファイル: page_rank.py プロジェクト: AlexArgus/disco

def receive_score(iter, params):
    """Yield each node with its new score and its neighbor string.

    Values tagged "s" contribute to the node's incoming sum; the value of
    any other tag is stored as the neighbor string. The new score is
    ``1 - d + d * incoming_sum`` with ``d = params.damping_factor``.
    """
    from disco.util import kvgroup
    damping = params.damping_factor
    for node_id, tagged_values in kvgroup(sorted(iter)):
        score_sum = 0
        neighbors = None
        for kind, value in tagged_values:
            if kind == "s":
                score_sum += value
            else:
                neighbors = value
        score = 1 - damping + damping * score_sum
        prefix = str(node_id) + " " + str(score) + " "
        yield node_id, prefix + neighbors

コード例 #24

0

ファイルを表示

ファイル: add.py プロジェクト: vtemian/kruncher

 def reduce(self, rows_iter, out, params):
     """Sum the grouped lines position by position for every key.

     A running list of totals is kept per key. Position ``i`` of each line
     is added to position ``i`` of the totals, which grow when a line is
     longer than the totals seen so far. The complete mapping is added to
     the output once, as the key of the pair ``(final, "a")``.
     """
     from disco.util import kvgroup
     final = {}
     for key, result in kvgroup(rows_iter):
         totals = final.setdefault(key, [])
         for line in result:
             for position in range(len(line)):
                 if position < len(totals):
                     totals[position] += line[position]
                 else:
                     totals.append(line[position])
     out.add(final, "a")

コード例 #25

0

ファイルを表示

ファイル: db.py プロジェクト: jamii/springer-analytics

 def reduce(iter, params):
     """Build a DiscoDB from the grouped input and dump it to a file.

     The partition is derived from the first key of the database. Nothing
     is written or yielded when the database has no keys.

     The dump file is now closed through a ``with`` block, and only the
     first-key lookup is guarded against ``StopIteration``.

     Yields:
         ``(partition, None)`` once the database has been written.
     """
     partitions = params['partitions']
     name = params['name']
     discodb = DiscoDB(kvgroup(iter))
     try:
         # figure out what partition we are in
         # (the builtin iter() is shadowed by the parameter, hence __iter__)
         key = next(discodb.keys().__iter__())
     except StopIteration:
         # no keys, nothing to write
         return
     partition = util.default_partition(key, partitions, params)
     with open(filename(name, partition), 'w') as dump_file:
         discodb.dump(dump_file)
     yield partition, None

コード例 #26

0

ファイルを表示

 def reduce(iter, params):
     """Pair every sentence of a word with that word's total count.

     Keys are ``(word, weight)`` tuples and the input is sorted, so the
     weight-0 group of a word comes before its weight-1 group. Sentences
     are only emitted when the preceding weight-0 group was for the same
     word.
     """
     last_word = None
     for (word, weight), data in kvgroup(sorted(iter)):
         if weight == 0:
             # Only the first value of the group is summed.
             first_value = list(data)[0]
             wordcounts = {word: sum([int(count) for count in first_value])}
             last_word = word
             continue
         if weight == 1 and word == last_word:
             for sentence in list(data):
                 yield sentence, wordcounts

コード例 #27

0

ファイルを表示

ファイル: vc_obs_best_week_day_for_billing_msisdn.py プロジェクト: miranetworks/ryanm_hackday_2013

def reduce(iter, params):
    """Yield the most frequent value of each key when it occurs repeatedly.

    The values of a key are counted with ``collections.Counter``, which
    replaces a ``list.count`` call per distinct value. A key is only
    emitted when its most frequent value occurs more than once. Ties are
    resolved by ``Counter.most_common``; previously the winner depended on
    set iteration order.

    Yields:
        ``(key, most_frequent_value)``.
    """
    from collections import Counter
    from disco.util import kvgroup
    for key, counts in kvgroup(sorted(iter)):
        occurrences = Counter(counts)
        if not occurrences:
            continue
        day, num = occurrences.most_common(1)[0]
        if num > 1:
            yield key, day

コード例 #28

0

ファイルを表示

ファイル: accuracy.py プロジェクト: sb123456789sb/discomll

def reduce_ca(interface, state, label, inp):
    """Write the fraction of matching value pairs under the key "CA".

    For each key the first two grouped values are compared. The number of
    equal pairs is divided by the number of keys.
    """
    from disco.util import kvgroup  # groups values by key
    out = interface.output(0)  # a single output label is used throughout

    number_of_samples = 0
    matches = 0

    grouped_input = kvgroup(inp)  # input pairs are sorted and grouped by key
    for _key, values in grouped_input:
        values = list(values)
        number_of_samples += 1
        if values[0] != values[1]:
            continue
        matches += 1

    out.add("CA", matches / float(number_of_samples))

コード例 #29

0

ファイルを表示

ファイル: locally_weighted_linear_regression.py プロジェクト: romanorac/discomll

def reduce_fit(interface, state, label, inp):
    """Solve one least-squares problem per sample from grouped partial sums.

    Keys are split on ``state["delimiter"]``. A key whose second field is
    "A" stores the summed values at the index given by its third field.
    Any other key supplies ``b``, triggers the solve, emits the result
    under the first field and resets ``A``.

    Output pairs are ``(sample_id, (prediction, thetas_as_list))``, where
    the prediction is the dot product of ``state["samples"][sample_id]``
    with the fitted thetas.
    """
    import numpy as np
    from disco.util import kvgroup

    def blank_row():
        # One slot per entry of X_indices plus one extra slot.
        return [0] * (len(state["X_indices"]) + 1)

    out = interface.output(0)
    A = blank_row()
    for k, v in kvgroup(inp):
        fields = k.split(state["delimiter"])
        if fields[1] == "A":
            A[int(fields[2])] = np.sum(v)
            continue
        b = np.sum(v)
        thetas = np.linalg.lstsq(A, b)[0]
        prediction = np.dot(state["samples"][fields[0]], thetas)
        out.add(fields[0], (prediction, thetas.tolist()))
        A = blank_row()

コード例 #30

0

ファイルを表示

ファイル: process_add_userinfo.py プロジェクト: Tskatom/Flu-Outbreak-Predicting

    def reduce(row_iter, params):
        """Emit the first user of each key and its edges enriched with it.

        Rows are sorted and grouped by key. The first "User" record is
        yielded under the group key. Every "Edge" record then gets that
        user as "source_attr" and is yielded under its own "target".
        Groups without a user record produce no output.
        """
        from disco.util import kvgroup

        for key, vals in kvgroup(sorted(row_iter)):
            records = list(vals)
            user = None
            for candidate in records:
                if candidate["type"] != "User":
                    continue
                user = candidate
                yield key, candidate
                break
            if not user:
                continue
            for edge in records:
                if edge["type"] != "Edge":
                    continue
                # The edge dict is modified in place before it is emitted.
                edge["source_attr"] = user
                yield edge["target"], edge

コード例 #31

0

ファイルを表示

def reduce_fit(interface, state, label, inp):
    """Fit thetas per sample with least squares and emit the predictions.

    Each key is split on ``state["delimiter"]`` into a sample id, a kind
    and, for kind "A", an index. "A" keys fill the corresponding slot of
    ``A`` with the sum of their values. Every other key provides ``b``,
    after which the system is solved, the result is emitted and ``A`` is
    reset to zeros.
    """
    import numpy as np
    from disco.util import kvgroup

    out = interface.output(0)
    A = [0] * (len(state["X_indices"]) + 1)
    for k, v in kvgroup(inp):
        ksplit = k.split(state["delimiter"])
        sample_id = ksplit[0]
        is_matrix_entry = ksplit[1] == "A"
        if is_matrix_entry:
            slot = int(ksplit[2])
            A[slot] = np.sum(v)
        else:
            b = np.sum(v)
            solution = np.linalg.lstsq(A, b)
            thetas = solution[0]
            sample = state["samples"][sample_id]
            out.add(sample_id, (np.dot(sample, thetas), thetas.tolist()))
            A = [0] * (len(state["X_indices"]) + 1)

コード例 #32

0

ファイルを表示

ファイル: WeatherDisco.py プロジェクト: pooya/Examples

    def reduce(iter, out, params):
        """Download the weather files of each date and emit TEMP statistics.

        For every date the listed files are fetched from ftp.ncdc.noaa.gov
        into ``/tmp/weather_files/<date>/``. They are read as gzip-compressed
        CSV with ``iopro`` and their ``TEMP`` columns are pooled. The mean
        and standard deviation of the pooled values are printed and added to
        the output as ``(date, (mean, std))``.

        Changes from the previous version:
          * the FTP connection and the cache file are always closed;
          * a failed download is retried once on a fresh connection, after
            the partially written cache file has been truncated;
          * only ``ftplib.all_errors`` trigger the retry (was bare except);
          * prints work on both Python 2 and 3;
          * unused locals and the unused ``shutil`` import were removed.
        """
        import numpy as np
        import ftplib,os
        import iopro
        from disco.util import kvgroup

        def connect():
            # Anonymous login, as before.
            connection = ftplib.FTP('ftp.ncdc.noaa.gov')
            connection.login()
            return connection

        for date, WeatherDateStat in kvgroup(iter):
            print(date)
            ftp = connect()

            avg_temp = []

            path = '/tmp/weather_files/'+str(date)+'/'

            if not os.path.exists(path):
                os.makedirs(path)
            try:
                for file in WeatherDateStat:
                    local_path = path+file.split('/')[-1]
                    with open(local_path,'wb') as cache:
                        try:
                            ftp.retrbinary("RETR " + file, cache.write, 8*1024)
                        except ftplib.all_errors:
                            # Reconnect and retry once from scratch so the
                            # cache never holds a partial download followed
                            # by the full file.
                            ftp.close()
                            ftp = connect()
                            cache.seek(0)
                            cache.truncate()
                            ftp.retrbinary("RETR " + file, cache.write, 8*1024)

                    adapter = iopro.text_adapter(local_path,compression='gzip',parser='csv', field_names=True)
                    try:
                        avg_temp.extend(adapter[:]['TEMP'])
                    finally:
                        adapter.close()
            finally:
                ftp.close()

            mean, std = np.mean(avg_temp), np.std(avg_temp)
            # Same text the old comma-separated print statement produced.
            print('Date Mean Std:  %s %s %s' % (date, mean, std))
            out.add(date, (mean, std))

コード例 #33

0

ファイルを表示

def reduce(iter, params):
    """Merge the distance tables received for each node.

    Every grouped item is a mapping with a "distances" table and,
    optionally, a non-empty "nodes" entry. Per target the smallest distance
    is kept; -1 is ignored unless no other value exists. The last non-empty
    "nodes" entry of a group is passed through.

    Yields:
        ``(node, json.dumps([node, merged_distances, nodes]))``.
    """
    def mymin(a, b):
        candidates = [x for x in (a, b) if x != -1]
        return min(candidates) if candidates else -1

    def merge_min(target, incoming):
        # Fold one incoming distance table into the running minimum table.
        for k, v in incoming.items():
            target[k] = mymin(target.get(k, -1), v)

    from disco.util import kvgroup
    for node, distances in kvgroup(sorted(iter)):
        nodes = []
        newdistances = {}
        for entry in list(distances):
            if entry.get("nodes"):
                nodes = entry["nodes"]
            merge_min(newdistances, entry["distances"])

        yield node, json.dumps([node, newdistances, nodes])

コード例 #34

0

ファイルを表示

ファイル: test.py プロジェクト: mmikulicic/simple-disco-example

def reduce(iter, params):
    """Combine the distance updates of each node into one JSON record.

    For every target the minimum of the reported distances is kept. The
    value -1 only survives when nothing else was reported. A non-empty
    "nodes" entry found in the updates is carried over; the last one wins.
    """
    def mymin(a, b):
        # -1 is excluded from the comparison and returned only as a fallback.
        known = [x for x in (a, b) if x != -1]
        if known:
            return min(known)
        return -1

    from disco.util import kvgroup
    for node, distances in kvgroup(sorted(iter)):
        updates = list(distances)
        nodes = []
        newdistances = {}

        for update in updates:
            if update.get("nodes"):
                nodes = update["nodes"]
            for target, distance in update["distances"].items():
                current = newdistances.get(target, -1)
                newdistances[target] = mymin(current, distance)

        payload = [node, newdistances, nodes]
        yield node, json.dumps(payload)

コード例 #35

0

ファイルを表示

ファイル: MapReduce_CountWords_Chain.py プロジェクト: pooya/Examples

 def reduce(pos_iter, out, params):
     """Add every key to the output together with the sum of its counts."""
     from disco.util import kvgroup
     ordered = sorted(pos_iter)
     for pos, counts in kvgroup(ordered):
         total = sum(counts)
         out.add(pos, total)

コード例 #36

0

ファイルを表示

ファイル: func.py プロジェクト: discoproject/discodex

def sum_vs(iter, *args):
    """Yield ``(key, sum of values)`` for every key of the sorted input.

    Extra positional arguments are accepted and ignored.
    """
    from disco.util import kvgroup
    ordered = sorted(iter)
    for key, values in kvgroup(ordered):
        total = sum(values)
        yield key, total

コード例 #37

0

ファイルを表示

ファイル: func.py プロジェクト: discoproject/discodex

def count_ks(iter, *args):
    """Yield ``(number of key groups, None)`` for the sorted input.

    Extra positional arguments are accepted and ignored.
    """
    from disco.util import kvgroup
    group_count = 0
    for _ in kvgroup(sorted(iter)):
        group_count += 1
    yield group_count, None

コード例 #38

0

ファイルを表示

ファイル: func.py プロジェクト: discoproject/discodex

def count_vs(iter, *args):
    """Yield ``(key, number of values)`` for every key of the sorted input.

    Extra positional arguments are accepted and ignored.
    """
    from disco.util import kvgroup
    ordered = sorted(iter)
    for key, values in kvgroup(ordered):
        yield key, len(list(values))

コード例 #39

0

ファイルを表示

def Reduce(interface, state, label, inp):
    """Add each base64-decoded key with the number of its values.

    ``base64.b64decode`` replaces ``base64.decodestring``, which was removed
    in Python 3.9.
    """
    out = interface.output(0)
    for k, vs in kvgroup(inp):
        out.add(base64.b64decode(k), len(list(vs)))

コード例 #40

0

ファイルを表示

ファイル: test_encode.py プロジェクト: nagyistge/discoproject.org-disco

def Reduce(interface, state, label, inp):
    """Add every distinct key, converted with ``str_to_bytes``, with value 0.

    The grouped values themselves are not used.
    """
    out = interface.output(0)
    for key, _values in kvgroup(inp):
        encoded_key = str_to_bytes(key)
        out.add(encoded_key, 0)

コード例 #41

0

ファイルを表示

ファイル: wordcount_ddb.py プロジェクト: nagyistge/discoproject.org-disco

 def reduce(iter, params):
     """Yield ``(word, total)`` with the total rendered as a string."""
     ordered = sorted(iter)
     for word, counts in kvgroup(ordered):
         total = sum(counts)
         yield word, str(total)

コード例 #42

0

ファイルを表示

def featurize_reduce(iter, params):
    '''Generate features for time series files (map-reduce reduce stage).

    Generator. `iter` is an iterable of tuples containing the file name of
    a time series data file to be used for featurization, and the
    associated class or type name. `params` is read for the keys
    'features_to_use', 'custom_script_path' and 'meta_features'.

    Yields a two-element tuple containing the file name of the time series
    data set, and dict of the extracted features. A placeholder ("", "")
    is additionally yielded when no class name is present or when the
    file is not found.

    NOTE(review): this block uses Python 2 print statements and relies on
    `dict.items()` returning lists that can be concatenated with `+`.
    '''
    from disco.util import kvgroup

    for fname, class_name in kvgroup(sorted(iter)):
        # Materialize the grouped class names; the first one is used below.
        class_names = []
        for classname in class_name:
            class_names.append(classname)
        if len(class_names) == 1:
            class_name = str(class_names[0])
        elif len(class_names) == 0:
            print "CLASS_NAMES: " + str(
                class_names) + "\n" + "CLASS_NAME: " + str(class_name)
            # NOTE(review): execution continues after this placeholder;
            # class_name is still the (exhausted) group iterator here.
            yield "", ""
        else:
            print "CLASS_NAMES: " + str(
                class_names) + "\n" + "CLASS_NAME: " + str(
                    class_name) + "  - Choosing first class name in list."
            class_name = str(class_names[0])

        print "fname: " + fname + ", class_name: " + class_name
        import os
        import sys
        # The project directory is hard-coded relative to the home
        # directory and appended to sys.path on every loop iteration.
        PATH_TO_PROJECT_DIRECTORY = os.path.join(os.path.expanduser("~"),
                                                 "Dropbox/work_etc/mlweb")
        sys.path.append(PATH_TO_PROJECT_DIRECTORY)
        import cfg
        sys.path.append(cfg.TCP_INGEST_TOOLS_PATH)

        # build_rf_model is imported but not referenced in this function.
        import generate_science_features
        import build_rf_model
        import lc_tools
        import custom_feature_tools as cft

        # File name without directory and without its last extension.
        short_fname = fname.split("/")[-1].replace(
            ("." +
             fname.split(".")[-1] if "." in fname.split("/")[-1] else ""), "")
        path_to_csv = os.path.join(cfg.UPLOAD_FOLDER,
                                   os.path.join("unzipped", fname))
        all_features = {}
        print "path_to_csv: " + path_to_csv
        if os.path.isfile(path_to_csv):
            print "Extracting features for " + fname

            ## generate features:
            # Each feature family is only computed when at least one of
            # the requested features belongs to it.
            if len(
                    list(
                        set(params['features_to_use'])
                        & set(cfg.features_list))) > 0:
                timeseries_features = lc_tools.generate_timeseries_features(
                    path_to_csv, classname=class_name, sep=',')
            else:
                timeseries_features = {}
            if len(
                    list(
                        set(params['features_to_use'])
                        & set(cfg.features_list_science))) > 0:
                science_features = generate_science_features.generate(
                    path_to_csv=path_to_csv)
            else:
                science_features = {}
            if params['custom_script_path']:
                # Custom features receive all features computed so far,
                # plus this file's meta features when present.
                custom_features = cft.generate_custom_features(
                    custom_script_path=params['custom_script_path'],
                    path_to_csv=path_to_csv,
                    features_already_known=dict(
                        timeseries_features.items() +
                        science_features.items() +
                        (params['meta_features'][fname].items() if fname in
                         params['meta_features'] else {}.items())))
            else:
                custom_features = {}

            all_features = dict(timeseries_features.items() +
                                science_features.items() +
                                custom_features.items() +
                                [("class", class_name)])

        else:
            print fname + " is not a file."
            yield "", ""

        # NOTE(review): this is also reached after the "not a file" branch,
        # in which case all_features is still the empty dict.
        yield short_fname, all_features

コード例 #43

0

ファイルを表示

 def test_map(self):
     """Run a map-only job and check that every input key sums to 10."""
     inputs = range(10 * self.num_workers)
     self.job = OnlyMapJob().run(input=self.test_server.urls(inputs))
     grouped = kvgroup(sorted(self.results(self.job)))
     actual = ((key, sum(values)) for key, values in grouped)
     expected = ((i, 10) for i in inputs)
     self.assertAllEqual(actual, expected)

コード例 #44

0

ファイルを表示

ファイル: disco_job.py プロジェクト: pooya/github_crawler

def reduce(iter, params):
    """Yield ``(extension, average ratio)`` for every extension."""
    from disco.util import kvgroup
    for extension, ratios in kvgroup(sorted(iter)):
        # Both the sum and the count are needed, so materialize the group.
        collected = list(ratios)
        total = sum(collected)
        yield extension, total / len(collected)

コード例 #45

0

ファイルを表示

ファイル: test_sort.py プロジェクト: chinnurtb/disco_playground

 def reduce(iter, params):
     """Yield each base64-decoded key with the number of its values.

     The input is grouped without being sorted first.
     NOTE(review): presumably it arrives ordered by key - confirm.

     ``base64.b64decode`` replaces ``base64.decodestring``, which was
     removed in Python 3.9.
     """
     for k, vs in kvgroup(iter):
         yield base64.b64decode(k), len(list(vs))

コード例 #46

0

ファイルを表示

 def reduce(iter, params):
     """Yield ``(key, total)`` with each value converted by ``int``."""
     ordered = sorted(iter)
     for key, values in kvgroup(ordered):
         yield key, sum(map(int, values))

コード例 #47

0

ファイルを表示

def pred_featurize_reduce(iter, params):
    '''Generate prediction features for time series files (reduce stage).

    Generator. `iter` is an iterable of tuples containing the file name of
    a time series data file to be used for featurization, and an unused
    placeholder string. `params` is read for the keys 'featset_key', 'sep',
    'custom_features_script' and 'meta_features'.

    Yields a two-element tuple containing the file name of the time series
    data set, and a two-element list containing the extracted features and
    the original time series data. Files that cannot be found are reported
    with print and skipped.

    NOTE(review): this block uses Python 2 print statements and relies on
    `dict.items()` returning lists that can be concatenated with `+`.
    '''
    from copy import deepcopy
    featset_key = params['featset_key']
    sep = params['sep']
    custom_features_script = params['custom_features_script']
    meta_features = params['meta_features']

    import sys, os
    from disco.util import kvgroup

    # os and sys are imported a second time here; this is harmless.
    import os
    import sys
    # The project directory is hard-coded relative to the home directory.
    PATH_TO_PROJECT_DIRECTORY = os.path.join(os.path.expanduser("~"),
                                             "Dropbox/work_etc/mlweb")
    sys.path.append(PATH_TO_PROJECT_DIRECTORY)
    import cfg
    sys.path.append(cfg.TCP_INGEST_TOOLS_PATH)

    # predict and build_rf_model are imported but not referenced in this
    # function.
    import generate_science_features
    import predict_class as predict
    import build_rf_model
    import lc_tools
    import custom_feature_tools as cft

    for fname, junk in kvgroup(sorted(iter)):
        # Try the name as given first, then relative to the upload folder.
        if os.path.isfile(fname):
            f = open(fname)
        elif os.path.isfile(os.path.join(cfg.UPLOAD_FOLDER, fname)):
            f = open(os.path.join(cfg.UPLOAD_FOLDER, fname))
        else:
            print(fname if cfg.UPLOAD_FOLDER in fname else os.path.join(
                cfg.UPLOAD_FOLDER, fname)) + " is not a file..."
            if os.path.exists(os.path.join(cfg.UPLOAD_FOLDER,
                                           fname)) or os.path.exists(fname):
                print "But it does exist on the disk."
            else:
                print "and in fact it doesn't even exist."
            continue

        lines = f.readlines()
        f.close()
        ts_data = []
        for i in range(len(lines)):
            # Split on the configured separator first, then switch to ",",
            # " " or tab whenever that split yields more fields.
            ts_data.append(lines[i].strip("\n").strip().split(sep))
            if len(ts_data[i]) < len(lines[i].strip("\n").strip().split(",")):
                ts_data[i] = lines[i].strip("\n").strip().split(",")
            if len(ts_data[i]) < len(lines[i].strip("\n").strip().split(" ")):
                ts_data[i] = lines[i].strip("\n").strip().split(" ")
            if len(ts_data[i]) < len(lines[i].strip("\n").strip().split("\t")):
                ts_data[i] = lines[i].strip("\n").strip().split("\t")

            # Every field must parse as a float; float() raises otherwise.
            for j in range(len(ts_data[i])):
                ts_data[i][j] = float(ts_data[i][j])
        del lines
        # The first line of the feature-set CSV lists the model's features.
        f = open(
            os.path.join(cfg.FEATURES_FOLDER, "%s_features.csv" % featset_key))
        features_in_model = f.readline().strip().split(',')
        f.close()

        features_to_use = features_in_model

        ## generate features:
        # Each feature family is only computed when at least one of the
        # model's features belongs to it. The generators receive deep
        # copies of ts_data.
        if len(list(set(features_to_use) & set(cfg.features_list))) > 0:
            timeseries_features = lc_tools.generate_timeseries_features(
                deepcopy(ts_data), sep=sep, ts_data_passed_directly=True)
        else:
            timeseries_features = {}
        if len(list(set(features_to_use)
                    & set(cfg.features_list_science))) > 0:
            science_features = generate_science_features.generate(
                ts_data=deepcopy(ts_data))
        else:
            science_features = {}
        if custom_features_script:
            custom_features = cft.generate_custom_features(
                custom_script_path=custom_features_script,
                path_to_csv=None,
                features_already_known=dict(timeseries_features.items() +
                                            science_features.items() +
                                            meta_features.items()),
                ts_data=deepcopy(ts_data))
        else:
            custom_features = {}

        # Later entries override earlier ones when keys collide.
        all_features = dict(timeseries_features.items() +
                            science_features.items() +
                            custom_features.items() + meta_features.items())

        yield fname, [all_features, ts_data]

コード例 #48

0

ファイルを表示

ファイル: letter_freq.py プロジェクト: AlexArgus/disco

def reduce(iter, params):
    """Yield ``(character, total count)`` for every character."""
    from disco.util import kvgroup
    ordered = sorted(iter)
    for char, counts in kvgroup(ordered):
        total = sum(counts)
        yield char, total

コード例 #49

0

ファイルを表示

ファイル: sort.py プロジェクト: mahmoudimus/MapReduce-Thorn

def fun_reduce(iter, params):
    """Yield ``(key, sorted values)`` for every key of the sorted input.

    The ``for`` header now ends with the colon it was missing; without it
    the snippet was a SyntaxError.
    """
    for k, v in kvgroup(sorted(iter)):
        yield k, sorted(v)

コード例 #50

0

ファイルを表示

ファイル: test_shards.py プロジェクト: sajal/MongoDisco

def reduce(iter, params):
    """Yield ``(age, total count)`` for every age."""
    from disco.util import kvgroup
    grouped = kvgroup(sorted(iter))
    for age, counts in grouped:
        yield age, sum(counts)

コード例 #51

0

ファイルを表示

ファイル: MapReduce_CountDT_Chain.py プロジェクト: pooya/Examples

 def reduce(dt_iter, out, params):
     """Add every key to the output together with the sum of its counts."""
     from disco.util import kvgroup
     grouped = kvgroup(sorted(dt_iter))
     for word, counts in grouped:
         total = sum(counts)
         out.add(word, total)

コード例 #52

0

ファイルを表示

ファイル: test_profile.py プロジェクト: AlexArgus/disco

 def reduce(iter, params):
     """Yield ``(key, total)`` after converting every value with ``int``."""
     grouped = kvgroup(sorted(iter))
     for key, values in grouped:
         total = sum(map(int, values))
         yield key, total

コード例 #53

0

ファイルを表示

ファイル: treasury_yield.py プロジェクト: isabella232/mongo-disco

def reduce(iter, params):
    """Yield ``(year, average bid price)`` for every year."""
    from disco.util import kvgroup
    for year, bid_prices in kvgroup(sorted(iter)):
        # Both the sum and the count are needed, so materialize the group.
        prices = list(bid_prices)
        total = sum(prices)
        yield year, total / len(prices)