    # NOTE(review): fragment — the enclosing process_record definition starts
    # above this chunk and is not visible here.
    content_textract_fname = "{}/content_textract".format(content_hash)
    # NOTE(review): existence is checked in BUCKET_EXTRA but the object is
    # written to / read back from BUCKET_CONTENTHASH below — confirm which
    # bucket is intended.
    content_textract_exists = file_exists_in_bucket(settings.BUCKET_EXTRA,
                                                    content_textract_fname)
    if not content_textract_exists:
        # this contains javascript/css rubbish - textract bug?
        # Write the raw HTML to a named temp file so textract can read it
        # from disk.
        content_file = tempfile.NamedTemporaryFile(suffix=".html")
        # NOTE(review): plain open/close instead of a `with` block; also, if
        # raw_content is bytes, str() embeds the "b'...'" repr — presumably it
        # is already a str here; confirm upstream.
        raw_local_fp = open(content_file.name, 'w')
        raw_local_fp.write(str(raw_content))
        raw_local_fp.close()  # flush write
        textract_content = textract.process(content_file.name)
        # Cache the extracted text in s3 keyed by content hash.
        textract_fp = s3_resource.Object(settings.BUCKET_CONTENTHASH,
                                         content_textract_fname)
        textract_fp.put(Body=str(textract_content))
        content_file.close()  # clean-up temp file, NamedTemporaryFile magic
        # Also persist a copy of the raw content alongside the textract output.
        raw_fp = s3_resource.Object(settings.BUCKET_CONTENTHASH,
                                    content_raw_fname)
        raw_fp.put(Body=str(raw_content))
    else:
        # Cached copy exists: re-read the previously extracted text.
        textract_content = s3_resource.Object(
            settings.BUCKET_CONTENTHASH,
            content_textract_fname).get()["Body"].read()
    data['content_textract'] = str(textract_content)
    print(data)  # DEBUG
    put_json_into_stream(output_stream, data, content_hash)


if __name__ == '__main__':
    # Consume verified raw content, emit textract-extracted text.
    main(process_record, settings.STREAM_VERIFIED_RAW,
         settings.STREAM_VERIFIED_TEXTRACT)
    # NOTE(review): fragment — the enclosing process_record definition starts
    # above this chunk and is not visible here.
    # Compute the minimum reading age under each readability formula and
    # attach the scores to the record.
    data["readability_score_ColemanLiau"] = ColemanLiau(goose_str).min_age
    data["readability_score_DaleChall"] = DaleChall(goose_str).min_age
    data["readability_score_Flesch"] = Flesch(goose_str).min_age
    data["readability_score_FleschKincaid"] = FleschKincaid(goose_str).min_age
    data["readability_score_LinsearWrite"] = LinsearWrite(goose_str).min_age
    data["readability_score_SMOG"] = SMOG(goose_str).min_age
    put_json_into_stream(output_stream, data, content_hash)
    print("{}.{} processed {}".format(worker_num, counter, content_hash))
    if DEBUG:
        # NOTE(review): "readability_score_ARI" is listed here but not set in
        # this fragment — presumably assigned earlier in the function; verify.
        keys = ("readability_score_ARI", "readability_score_ColemanLiau",
                "readability_score_DaleChall", "readability_score_Flesch",
                "readability_score_FleschKincaid",
                "readability_score_LinsearWrite", "readability_score_SMOG")
        try:
            content = data["content_goose"].encode('utf8', 'ignore')
            for k in keys:
                print("{}: {}".format(k, data[k]))
            print(content)
            print(
                "------------------------------------------------------------------"
            )
            print("")
        # NOTE(review): bare except silently drops any debug-print failure —
        # looks like deliberate best-effort, but a narrower exception type
        # would hide fewer bugs.
        except:
            pass


if __name__ == '__main__':
    # Consume goose-extracted content, emit readability-scored records.
    main(process_record, settings.STREAM_VERIFIED_GOOSE,
         settings.STREAM_READABILITY_SCORED)
# this may contain escape sequences raw_content = str( s3_resource.Object(settings.BUCKET_CONTENTHASH, content_hash).get()["Body"].read()) # so, we need to interpret them raw_content = bytes(raw_content, "utf-8").decode("unicode_escape") #print(bytes(raw_content, 'utf-8').decode('ascii','ignore')) content_raw_fname = "{}/content_raw".format(content_hash) content_raw_exists = file_exists_in_bucket(settings.BUCKET_EXTRA, content_raw_fname) if not content_raw_exists: # make a copy of the raw content in our working s3 # this is a debug thing, we can stop doing it later raw_fp = s3_resource.Object(settings.BUCKET_EXTRA, content_raw_fname) raw_fp.put(Body=raw_content) else: # this does not contain escape sequences raw_content = s3_resource.Object( settings.BUCKET_EXTRA, content_raw_fname).get()["Body"].read() data['content_raw_fname'] = content_raw_fname put_json_into_stream(output_stream, json.dumps(data), content_hash) print("{}.{} processed {}".format(worker_num, counter, content_hash)) if __name__ == '__main__': main(process_record, settings.STREAM_QUALIFIED_URLS, settings.STREAM_VERIFIED_RAW)
import json

import settings
from base_node import main


def process_record(worker_num, counter, record, output_stream):
    """Consume one record from the model-input stream.

    Decodes the record's JSON payload and prints its contentHash. This node
    is a sink, so ``output_stream`` is unused here.
    """
    payload = json.loads(record['Data'])
    # Placeholder processing step: just show which content hash arrived.
    print(payload['contentHash'])


if __name__ == '__main__':
    # Read from the model-input stream; no downstream stream (sink node).
    main(process_record, settings.STREAM_MODEL_INPUT, None)
        # NOTE(review): fragment — the enclosing process_record definition
        # (and the `if` this try sits inside) starts above this chunk and is
        # not visible here.
        # Decode escape sequences, then re-encode so BeautifulSoup gets bytes.
        try:
            raw_content = bytes(raw_content.decode("unicode_escape"), 'utf-8')
        # NOTE(review): bare except — any failure here (not only decode
        # errors) skips this record; UnicodeDecodeError would be safer.
        except:
            print("{}.{} problem decoding {}".format(
                worker_num, counter, content_hash))
            # or not...
            return False
        soup = BeautifulSoup(raw_content, "lxml")
        # Strip <script> and <style> elements so only visible text remains.
        for script in soup(["script", "style"]):
            script.extract()
        bs4_content = bytes(soup.get_text(), 'utf-8')
        #print(bs4_content.decode('ascii', 'ignore'))
        # Cache the cleaned text in s3.
        bs4_fp = s3_resource.Object(settings.BUCKET_EXTRA, content_bs4_fname)
        bs4_fp.put(Body=bs4_content)
    else:
        # this does not contain escape characters
        bs4_content = s3_resource.Object(
            settings.BUCKET_EXTRA, content_bs4_fname).get()["Body"].read()
    # do something with bs4_content now?
    data['content_bs4_fname'] = content_bs4_fname
    put_json_into_stream(output_stream, data, content_hash)
    print("{}.{} processed {}".format(worker_num, counter, content_hash))
    #print(json.dumps(data, indent=4))


if __name__ == '__main__':
    # Consume verified raw content, emit bs4-cleaned text records.
    main(process_record, settings.STREAM_VERIFIED_RAW,
         settings.STREAM_VERIFIED_BS4)
import json

import settings
from base_node import main, put_json_into_stream


def process_record(worker_num, counter, record, output_stream):
    """Handle one record from the qualified-URLs stream.

    Decodes the record's JSON payload, logs which URL (``identifier``) is
    being processed, and — when a downstream stream is configured — forwards
    the payload keyed by its ``uuid``.
    """
    # Fix: removed the stray `global kinesis_client` declaration — the name
    # was never read or assigned in this function.
    data = json.loads(record['Data'])
    # do something fantastic with the data here
    msg = "worker {} [record {}] url: {}"
    print(msg.format(worker_num, counter, data['identifier']))
    if output_stream:
        put_json_into_stream(output_stream, data, data['uuid'])


if __name__ == '__main__':
    # Read from the qualified-URLs stream; no downstream stream configured.
    main(process_record, settings.STREAM_QUALIFIED_URLS, None)
            # NOTE(review): fragment — the enclosing process_record definition
            # (and the `if` this chunk sits inside) starts above this chunk
            # and is not visible here.
            # Collapse whitespace: strip lines, split them into phrases, and
            # drop empty chunks.
            bs4_content = soup.get_text()
            lines = (line.strip() for line in bs4_content.splitlines())
            chunks = (phrase.strip() for line in lines
                      for phrase in line.split(" "))
            bs4_content = '\n'.join(chunk for chunk in chunks if chunk)
            bs4_fp = s3_resource.Object(settings.BUCKET_CONTENTHASH,
                                        content_bs4_fname)
            bs4_fp.put(Body=str(bs4_content))
            content_goose_fname = "{}/content_goose".format(content_hash)
            # NOTE(review): existence is checked in BUCKET_EXTRA but the
            # object is stored in / read from BUCKET_CONTENTHASH below —
            # confirm which bucket is intended.
            content_goose_exists = file_exists_in_bucket(settings.BUCKET_EXTRA,
                                                         content_goose_fname)
            if not content_goose_exists:
                # Extract the main article text with Goose and cache it in s3.
                g = Goose()
                article = g.extract(raw_html=raw_content)
                goose_content = article.cleaned_text
                goose_fp = s3_resource.Object(settings.BUCKET_CONTENTHASH,
                                              content_goose_fname)
                goose_fp.put(Body=str(goose_content))
            else:
                # Cached copy exists: re-read the previously extracted text.
                goose_content = s3_resource.Object(
                    settings.BUCKET_CONTENTHASH,
                    content_goose_fname).get()["Body"].read()
            data['content_goose'] = goose_content
            put_json_into_stream(output_stream, data, content_hash)
            return True
        else:
            print("ERROR contentHash {} specified but does not exist".format(
                content_hash))
            return False


if __name__ == '__main__':
    # Consume qualified URLs, emit model-input records.
    main(process_record, settings.STREAM_QUALIFIED_URLS,
         settings.STREAM_MODEL_INPUT)