コード例 #1
0
    content_textract_fname = "{}/content_textract".format(content_hash)
    content_textract_exists = file_exists_in_bucket(settings.BUCKET_EXTRA,
                                                    content_textract_fname)
    if not content_textract_exists:
        # this contains javascript/css rubbish - textract bug?
        content_file = tempfile.NamedTemporaryFile(suffix=".html")
        raw_local_fp = open(content_file.name, 'w')
        raw_local_fp.write(str(raw_content))
        raw_local_fp.close()  # flush write
        textract_content = textract.process(content_file.name)
        textract_fp = s3_resource.Object(settings.BUCKET_CONTENTHASH,
                                         content_textract_fname)
        textract_fp.put(Body=str(textract_content))
        content_file.close()  # clean-up temp file, NamedTemporaryFile magic
        raw_fp = s3_resource.Object(settings.BUCKET_CONTENTHASH,
                                    content_raw_fname)
        raw_fp.put(Body=str(raw_content))
    else:
        textract_content = s3_resource.Object(
            settings.BUCKET_CONTENTHASH,
            content_textract_fname).get()["Body"].read()

    data['content_textract'] = str(textract_content)
    print(data)  # DEBUG
    put_json_into_stream(output_stream, data, content_hash)


if __name__ == '__main__':
    # Script entry point: hand the per-record callback and the two stream
    # settings to main().
    # NOTE(review): presumably the second argument is the stream read from
    # and the third the stream written to; main() is not visible in this
    # excerpt, so confirm there.
    main(
        process_record,
        settings.STREAM_VERIFIED_RAW,
        settings.STREAM_VERIFIED_TEXTRACT,
    )
コード例 #2
0
    data["readability_score_ColemanLiau"] = ColemanLiau(goose_str).min_age
    data["readability_score_DaleChall"] = DaleChall(goose_str).min_age
    data["readability_score_Flesch"] = Flesch(goose_str).min_age
    data["readability_score_FleschKincaid"] = FleschKincaid(goose_str).min_age
    data["readability_score_LinsearWrite"] = LinsearWrite(goose_str).min_age
    data["readability_score_SMOG"] = SMOG(goose_str).min_age

    put_json_into_stream(output_stream, data, content_hash)
    print("{}.{} processed {}".format(worker_num, counter, content_hash))
    if DEBUG:
        keys = ("readability_score_ARI", "readability_score_ColemanLiau",
                "readability_score_DaleChall", "readability_score_Flesch",
                "readability_score_FleschKincaid",
                "readability_score_LinsearWrite", "readability_score_SMOG")
        try:
            content = data["content_goose"].encode('utf8', 'ignore')
            for k in keys:
                print("{}: {}".format(k, data[k]))
            print(content)
            print(
                "------------------------------------------------------------------"
            )
            print("")
        except:
            pass


if __name__ == '__main__':
    # Script entry point: hand the per-record callback and the two stream
    # settings to main().
    # NOTE(review): presumably goose-verified records come in and
    # readability-scored records go out; main() is not visible in this
    # excerpt, so confirm the argument roles there.
    main(
        process_record,
        settings.STREAM_VERIFIED_GOOSE,
        settings.STREAM_READABILITY_SCORED,
    )
コード例 #3
0
ファイル: process_raw.py プロジェクト: koriaf/disco_crawl
        # this may contain escape sequences
        raw_content = str(
            s3_resource.Object(settings.BUCKET_CONTENTHASH,
                               content_hash).get()["Body"].read())
        # so, we need to interpret them
        raw_content = bytes(raw_content, "utf-8").decode("unicode_escape")
        #print(bytes(raw_content, 'utf-8').decode('ascii','ignore'))

        content_raw_fname = "{}/content_raw".format(content_hash)
        content_raw_exists = file_exists_in_bucket(settings.BUCKET_EXTRA,
                                                   content_raw_fname)
        if not content_raw_exists:
            # make a copy of the raw content in our working s3
            # this is a debug thing, we can stop doing it later
            raw_fp = s3_resource.Object(settings.BUCKET_EXTRA,
                                        content_raw_fname)
            raw_fp.put(Body=raw_content)
        else:
            # this does not contain escape sequences
            raw_content = s3_resource.Object(
                settings.BUCKET_EXTRA, content_raw_fname).get()["Body"].read()

        data['content_raw_fname'] = content_raw_fname
        put_json_into_stream(output_stream, json.dumps(data), content_hash)
        print("{}.{} processed {}".format(worker_num, counter, content_hash))


if __name__ == '__main__':
    # Script entry point: hand the per-record callback and the two stream
    # settings to main().
    # NOTE(review): presumably qualified URLs are consumed and verified raw
    # records are produced; main() is not visible in this excerpt, so
    # confirm the argument roles there.
    main(
        process_record,
        settings.STREAM_QUALIFIED_URLS,
        settings.STREAM_VERIFIED_RAW,
    )
コード例 #4
0
import json
import settings
from base_node import main


def process_record(worker_num, counter, record, output_stream):
    """Parse one record and print the ``contentHash`` of its payload.

    Args:
        worker_num: Not used by this function.
        counter: Not used by this function.
        record: Mapping whose ``'Data'`` entry is a JSON document that
            contains a ``'contentHash'`` key.
        output_stream: Not used by this function; nothing is forwarded.

    Raises:
        KeyError: If ``'Data'`` or ``'contentHash'`` is missing.
        ValueError: If ``record['Data']`` is not valid JSON.
    """
    payload = json.loads(record['Data'])
    content_hash = payload['contentHash']

    # placeholder: real processing of the payload would go here
    print(content_hash)


if __name__ == '__main__':
    # Script entry point: run the node on the model-input stream. The third
    # argument is None and process_record never touches its output_stream.
    # NOTE(review): presumably None means "no output stream" to main();
    # confirm in base_node.
    main(
        process_record,
        settings.STREAM_MODEL_INPUT,
        None,
    )
コード例 #5
0
        try:
            raw_content = bytes(raw_content.decode("unicode_escape"), 'utf-8')
        except:
            print("{}.{} problem decoding {}".format(
                worker_num, counter, content_hash))  # or not...
            return False

        soup = BeautifulSoup(raw_content, "lxml")
        for script in soup(["script", "style"]):
            script.extract()
        bs4_content = bytes(soup.get_text(), 'utf-8')
        #print(bs4_content.decode('ascii', 'ignore'))

        bs4_fp = s3_resource.Object(settings.BUCKET_EXTRA, content_bs4_fname)
        bs4_fp.put(Body=bs4_content)
    else:
        # this does not contain escape characters
        bs4_content = s3_resource.Object(
            settings.BUCKET_EXTRA, content_bs4_fname).get()["Body"].read()

    # do something with bs4_content now?
    data['content_bs4_fname'] = content_bs4_fname
    put_json_into_stream(output_stream, data, content_hash)
    print("{}.{} processed {}".format(worker_num, counter, content_hash))
    #print(json.dumps(data, indent=4))


if __name__ == '__main__':
    # Script entry point: hand the per-record callback and the two stream
    # settings to main().
    # NOTE(review): presumably verified raw records are consumed and
    # bs4-processed records are produced; main() is not visible in this
    # excerpt, so confirm the argument roles there.
    main(
        process_record,
        settings.STREAM_VERIFIED_RAW,
        settings.STREAM_VERIFIED_BS4,
    )
コード例 #6
0
import json
import settings
from base_node import main, put_json_into_stream


def process_record(worker_num, counter, record, output_stream):
    """Log the record's URL and forward its payload if a stream is given.

    Args:
        worker_num: Worker number; only used in the log line.
        counter: Record counter; only used in the log line.
        record: Mapping whose ``'Data'`` entry is a JSON document with an
            ``'identifier'`` key (and a ``'uuid'`` key when forwarding).
        output_stream: Passed through to ``put_json_into_stream``; any
            falsy value (e.g. ``None``) disables forwarding.

    Raises:
        KeyError: If ``'Data'``, ``'identifier'`` or (when forwarding)
            ``'uuid'`` is missing.
        ValueError: If ``record['Data']`` is not valid JSON.
    """
    # The former ``global kinesis_client`` declaration was dropped: the name
    # was never read or assigned here, so the statement had no effect.
    data = json.loads(record['Data'])

    # do something fantastic with the data here
    msg = "worker {} [record {}] url: {}"
    print(msg.format(worker_num, counter, data['identifier']))

    if output_stream:
        # NOTE(review): the uuid is presumably used as the stream's
        # partition key -- confirm in base_node.put_json_into_stream.
        put_json_into_stream(output_stream, data, data['uuid'])


if __name__ == '__main__':
    # Script entry point: run the node on the qualified-URLs stream with
    # None as the third argument.
    # NOTE(review): presumably main() hands that None to process_record as
    # output_stream, which makes it skip forwarding; confirm in base_node.
    main(
        process_record,
        settings.STREAM_QUALIFIED_URLS,
        None,
    )
コード例 #7
0
            bs4_content = soup.get_text()
            lines = (line.strip() for line in bs4_content.splitlines())
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            bs4_content = '\n'.join(chunk for chunk in chunks if chunk)
            bs4_fp = s3_resource.Object(settings.BUCKET_CONTENTHASH, content_bs4_fname)
            bs4_fp.put(Body=str(bs4_content))

        content_goose_fname = "{}/content_goose".format(content_hash)
        content_goose_exists = file_exists_in_bucket(settings.BUCKET_EXTRA, content_goose_fname)
        if not content_goose_exists:
            g = Goose()
            article = g.extract(raw_html=raw_content)
            goose_content = article.cleaned_text
            goose_fp = s3_resource.Object(settings.BUCKET_CONTENTHASH, content_goose_fname)
            goose_fp.put(Body=str(goose_content))
        else:
            goose_content = s3_resource.Object(
                settings.BUCKET_CONTENTHASH, content_goose_fname
            ).get()["Body"].read()
        data['content_goose'] = goose_content
        
        put_json_into_stream(output_stream, data, content_hash)
        return True
    else:
        print("ERROR contentHash {} specified but does not exist".format(content_hash))
        return False


if __name__ == '__main__':
    # Script entry point: hand the per-record callback and the two stream
    # settings to main().
    # NOTE(review): presumably qualified URLs are consumed and model-input
    # records are produced; main() is not visible in this excerpt, so
    # confirm the argument roles there.
    main(
        process_record,
        settings.STREAM_QUALIFIED_URLS,
        settings.STREAM_MODEL_INPUT,
    )