def reducer_init(self):
    """Load the precomputed term->IDF table from S3 into self.word_to_idf.

    Connects with the module-level AWS credentials, lists every part file
    under the fixed S3 prefix, and parses each line as a JSON object of the
    form {"term": ..., "idf": ...}.
    """
    emr = EMRJobRunner(aws_access_key_id=AWS_ACCESS_KEY,
                       aws_secret_access_key=AWS_SECRET_KEY)
    idf_parts = emr.get_s3_keys('s3://6885public/jeffchan/term-idfs/')
    self.word_to_idf = dict()
    for part in idf_parts:
        # BUG FIX: the original bound the downloaded contents to a local
        # named `json`, shadowing the json module and making the
        # json.loads() call below fail with AttributeError. Use a
        # non-conflicting name instead.
        contents = part.get_contents_as_string()
        # Download the whole object first, then iterate line-by-line:
        # raw S3 read chunks are not guaranteed to align with lines.
        for line in StringIO.StringIO(contents):
            pair = json.loads(line)
            self.word_to_idf[pair['term']] = pair['idf']
def reducer_init(self):
    """Populate self.idfs (term -> idf) from the S3 location the user gave.

    Uses explicit AWS credentials when both were supplied as options;
    otherwise lets EMRJobRunner discover credentials on its own.
    """
    self.idfs = {}
    access_key = self.options.aws_access_key_id
    secret_key = self.options.aws_secret_access_key
    if access_key and secret_key:
        emr = EMRJobRunner(aws_access_key_id=access_key,
                           aws_secret_access_key=secret_key)
    else:
        emr = EMRJobRunner()
    # Walk every key under the user-provided bucket/prefix.
    for key in emr.get_s3_keys("s3://" + self.options.idf_loc):
        # Fetch the full object before splitting into lines: streamed S3
        # chunks need not end on line boundaries.
        contents = key.get_contents_as_string()
        for record in StringIO(contents):
            # JSONValueProtocol.read returns (key, value); the value is
            # the decoded JSON object for this line.
            parsed = JSONValueProtocol.read(record)[1]
            self.idfs[parsed['term']] = parsed['idf']
def reducer_init(self):
    """Build the in-memory term -> idf lookup table from S3 part files."""
    self.idfs = {}
    # Forward credentials only when the user supplied both halves;
    # an empty kwargs dict lets EMRJobRunner use its own defaults.
    runner_kwargs = {}
    if self.options.aws_access_key_id and self.options.aws_secret_access_key:
        runner_kwargs = dict(
            aws_access_key_id=self.options.aws_access_key_id,
            aws_secret_access_key=self.options.aws_secret_access_key)
    emr = EMRJobRunner(**runner_kwargs)
    for key in emr.get_s3_keys("s3://" + self.options.idf_loc):
        # Read the entire object up front so iteration splits on real
        # line boundaries, which raw S3 chunks do not guarantee.
        for line in StringIO(key.get_contents_as_string()):
            # Each line decodes to (key, value); keep only the JSON value.
            _, term_idf = JSONValueProtocol.read(line)
            self.idfs[term_idf['term']] = term_idf['idf']