コード例 #1
0
ファイル: mr_tf.py プロジェクト: jeffchan/asciiclass
 def reducer_init(self):
     """Load the term -> IDF lookup table from S3 into ``self.word_to_idf``.

     Every key returned for the hard-coded ``term-idfs`` S3 prefix is
     downloaded in full and parsed one line at a time. Each line must be a
     JSON object with ``term`` and ``idf`` fields. If a term appears more
     than once, the value read last wins.

     Raises:
         ValueError: if a line is not valid JSON.
         KeyError: if a decoded line lacks ``term`` or ``idf``.
     """
     emr = EMRJobRunner(aws_access_key_id=AWS_ACCESS_KEY, aws_secret_access_key=AWS_SECRET_KEY)
     idf_parts = emr.get_s3_keys('s3://6885public/jeffchan/term-idfs/')
     self.word_to_idf = dict()
     for part in idf_parts:
         # Keep this local's name distinct from the ``json`` module: binding
         # the downloaded string to ``json`` would make ``json.loads`` below
         # resolve to a string attribute and fail with AttributeError.
         contents = part.get_contents_as_string()
         # Wrap the full contents in a file-like object so iteration yields
         # whole lines.
         for line in StringIO.StringIO(contents):
             pair = json.loads(line)
             self.word_to_idf[pair['term']] = pair['idf']
コード例 #2
0
ファイル: mr_tf.py プロジェクト: jeffchan/asciiclass
 def reducer_init(self):
     """Load the term -> IDF lookup table from S3 into ``self.word_to_idf``.

     Every key returned for the hard-coded ``term-idfs`` S3 prefix is
     downloaded in full and parsed one line at a time. Each line must be a
     JSON object with ``term`` and ``idf`` fields. If a term appears more
     than once, the value read last wins.

     Raises:
         ValueError: if a line is not valid JSON.
         KeyError: if a decoded line lacks ``term`` or ``idf``.
     """
     emr = EMRJobRunner(aws_access_key_id=AWS_ACCESS_KEY,
                        aws_secret_access_key=AWS_SECRET_KEY)
     idf_parts = emr.get_s3_keys('s3://6885public/jeffchan/term-idfs/')
     self.word_to_idf = dict()
     for part in idf_parts:
         # Keep this local's name distinct from the ``json`` module: binding
         # the downloaded string to ``json`` would make ``json.loads`` below
         # resolve to a string attribute and fail with AttributeError.
         contents = part.get_contents_as_string()
         # Wrap the full contents in a file-like object so iteration yields
         # whole lines.
         for line in StringIO.StringIO(contents):
             pair = json.loads(line)
             self.word_to_idf[pair['term']] = pair['idf']
コード例 #3
0
    def reducer_init(self):
        """Populate ``self.idfs`` (term -> idf) from the S3 location in the options.

        The location is ``"s3://"`` followed by ``self.options.idf_loc``.
        AWS credentials from the options are passed to ``EMRJobRunner`` only
        when both the access key id and the secret key are set; otherwise the
        runner is created with no arguments.
        """
        self.idfs = {}

        opts = self.options

        # Only forward credentials when the user supplied a complete pair.
        credentials = {}
        if opts.aws_access_key_id and opts.aws_secret_access_key:
            credentials = {
                'aws_access_key_id': opts.aws_access_key_id,
                'aws_secret_access_key': opts.aws_secret_access_key,
            }
        runner = EMRJobRunner(**credentials)

        idf_uri = "s3://" + opts.idf_loc
        for s3_key in runner.get_s3_keys(idf_uri):
            # Fetch the complete object before splitting it into lines, so a
            # line can never be cut in half by a chunk boundary.
            contents = s3_key.get_contents_as_string()
            for raw_line in StringIO(contents):
                # Element 1 of the protocol's result is used as the decoded
                # JSON object (presumably a (key, value) pair -- confirm
                # against the mrjob version in use).
                decoded = JSONValueProtocol.read(raw_line)
                record = decoded[1]
                self.idfs[record['term']] = record['idf']
コード例 #4
0
    def reducer_init(self):
        """Build the ``self.idfs`` term -> idf mapping from files stored on S3.

        Reads every key found under ``"s3://" + self.options.idf_loc`` and
        stores the ``idf`` of each line's JSON object under its ``term``.
        ``EMRJobRunner`` receives explicit AWS credentials only when both
        credential options are truthy; otherwise it is built with no
        arguments.
        """
        self.idfs = {}

        options = self.options
        use_explicit_credentials = (
            options.aws_access_key_id and options.aws_secret_access_key)
        runner = (
            EMRJobRunner(
                aws_access_key_id=options.aws_access_key_id,
                aws_secret_access_key=options.aws_secret_access_key)
            if use_explicit_credentials
            else EMRJobRunner()
        )

        # Each object is downloaded whole and only then split into lines;
        # reading it in chunks could hand us partial lines.
        all_lines = (
            text_line
            for s3_object in runner.get_s3_keys("s3://" + options.idf_loc)
            for text_line in StringIO(s3_object.get_contents_as_string())
        )

        for text_line in all_lines:
            # Index 1 of the protocol result is used as the decoded JSON
            # object for this line.
            entry = JSONValueProtocol.read(text_line)[1]
            self.idfs[entry['term']] = entry['idf']