Example #1
def save_dce(dce_data, s3_resource, collection):
    """save_dce(): Save one DCE to AWS Glacier
    """
    annonce_id = dce_data['annonce_id']
    file_types = ['reglement', 'complement', 'avis', 'dce']
    filenames = [
        dce_data['filename_reglement'], dce_data['filename_complement'],
        dce_data['filename_avis'], dce_data['filename_dce']
    ]

    # Check that no file is too large to be uploaded with boto3 (max 4 GiB).
    # If a file is that large, we probably don't want to back it up or index it.
    for file_type, filename in zip(file_types, filenames):
        if not filename:
            continue

        internal_filepath = build_internal_filepath(annonce_id=annonce_id,
                                                    original_filename=filename,
                                                    file_type=file_type)

        file_size = os.path.getsize(internal_filepath)
        if file_size >= 4294967296:  # 4 GiB
            print('Warning: {} is too large to be saved on AWS Glacier'.format(
                internal_filepath))

            collection.update_one(
                {'annonce_id': annonce_id},
                {'$set': {
                    'state': STATE_GLACIER_KO
                }},
            )
            return

    for file_type, filename in zip(file_types, filenames):
        if not filename:
            continue

        internal_filepath = build_internal_filepath(annonce_id=annonce_id,
                                                    original_filename=filename,
                                                    file_type=file_type)
        internal_filename = os.path.basename(internal_filepath)
        if CONFIG_ENV['env'] != 'production':
            print('Debug: Saving {} on AWS S3 Glacier Deep Archive...'.format(
                internal_filepath))
        s3_resource.meta.client.upload_file(
            Filename=internal_filepath,
            Bucket=CONFIG_AWS_GLACIER['bucket_name'],
            Key=internal_filename,
            ExtraArgs={'StorageClass': 'DEEP_ARCHIVE'})

    collection.update_one(
        {'annonce_id': annonce_id},
        {'$set': {
            'state': STATE_GLACIER_OK
        }},
    )

    if CONFIG_ENV['env'] != 'production':
        print('Debug: Saved {} on AWS Glacier'.format(annonce_id))
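
A minimal driver sketch, not part of the original: how save_dce() might be invoked, assuming a default boto3 session and the client.place.dce MongoDB collection used in Example #5. The 'fetch_ok' state value is a placeholder for whatever constant the project actually uses.

import boto3
from pymongo import MongoClient

s3_resource = boto3.resource('s3')
client = MongoClient()
collection = client.place.dce

# 'fetch_ok' is hypothetical; the real code presumably filters on a
# STATE_FETCH_OK constant defined elsewhere in the project.
for dce_data in collection.find({'state': 'fetch_ok'}):
    save_dce(dce_data, s3_resource, collection)

client.close()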
Example #2
def process_link(link, connection, cursor):
    """
    process_link : Download data and store it in database.
    Return the number of stored DCE (0 or 1).
    """
    annonce_id, org_acronym = re.match(LINK_REGEX, link).groups()

    # abort if the DCE is already processed
    cursor.execute(
        "SELECT annonce_id, org_acronym FROM dce WHERE annonce_id = %s AND org_acronym = %s;",
        (annonce_id, org_acronym))
    results = cursor.fetchall()
    if results:
        return 0

    try:
        (annonce_id, org_acronym, links_boamp, reference, intitule, objet,
         reglement_ref, filename_reglement, reglement, filename_complement,
         complement, filename_avis, avis, filename_dce, dce) = fetch_data(link)
    except Exception as exception:
        print("Warning: exception occured ({}) : {}".format(exception, link))
        return 0

    now = datetime.datetime.now()

    file_types = ['reglement', 'complement', 'avis', 'dce']
    filenames = [
        filename_reglement, filename_complement, filename_avis, filename_dce
    ]
    file_contents = [reglement, complement, avis, dce]
    for file_type, filename, file_content in zip(file_types, filenames,
                                                 file_contents):
        if file_content:
            internal_filepath = build_internal_filepath(
                annonce_id, org_acronym, filename, file_type)
            with open(internal_filepath, 'wb') as file_object:
                file_object.write(file_content)

    cursor.execute(
        """
        INSERT INTO dce (
            annonce_id, org_acronym, links_boamp,
            reference, intitule, objet,
            reglement_ref, filename_reglement, filename_complement, filename_avis, filename_dce,
            fetch_datetime,
            state
            )
            VALUES (
            %s, %s, %s,
            %s, %s, %s,
            %s, %s, %s, %s, %s,
            %s,
            %s
           )""",
        (annonce_id, org_acronym, links_boamp, reference, intitule, objet,
         reglement_ref, filename_reglement, filename_complement, filename_avis,
         filename_dce, now, STATE_FETCH_OK))
    connection.commit()
    return 1
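
A hedged sketch of a caller, assuming psycopg2 for PostgreSQL (the %s parameter style and the psql_request_template in Example #4 point that way). The connection parameters and the link are placeholders; real links must match LINK_REGEX.

import psycopg2

connection = psycopg2.connect(dbname='place', user='place')  # placeholder credentials
cursor = connection.cursor()

links = ['https://example.invalid/dce-link']  # placeholder; must match LINK_REGEX
stored = sum(process_link(link, connection, cursor) for link in links)
print('{} new DCE stored'.format(stored))

cursor.close()
connection.close()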
Example #3
def write_response_to_file(annonce_id, filename, file_type, response):
    """Stream an HTTP response body to its internal file and return its size."""
    internal_filepath = build_internal_filepath(annonce_id=annonce_id,
                                                original_filename=filename,
                                                file_type=file_type)
    with open(internal_filepath, 'wb') as file_object:
        for chunk in response.iter_content(8192):
            file_object.write(chunk)
    return os.path.getsize(internal_filepath)
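
A usage sketch: iter_content() is the streaming API of the requests library, so the response is presumably obtained with stream=True. The URL and identifiers are placeholders.

import requests

response = requests.get('https://example.invalid/avis.pdf', stream=True)  # placeholder URL
response.raise_for_status()
size = write_response_to_file('123', 'avis.pdf', 'avis', response)
print('Wrote {} bytes'.format(size))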
Example #4
def save_dce(annonce_id, org_acronym, intitule, filename_reglement,
             filename_complement, filename_avis, filename_dce, connection,
             cursor, glacier_client):
    """save_dce(): Save one DCE to AWS Glacier
    """
    file_types = ['reglement', 'complement', 'avis', 'dce']
    filenames = [
        filename_reglement, filename_complement, filename_avis, filename_dce
    ]

    for file_type, filename in zip(file_types, filenames):
        if not filename:
            continue

        # Glacier archive descriptions must be printable ASCII and at most
        # 1,024 characters, hence unidecode() and the truncation below.
        archive_description = '{}-{} {} ({}) {}'.format(
            annonce_id, org_acronym, file_type, filename, intitule)
        archive_description = unidecode(archive_description)
        archive_description = archive_description[:1023]

        internal_filepath = build_internal_filepath(annonce_id, org_acronym,
                                                    filename, file_type)
        if CONFIG_ENV['env'] != 'production':
            print('Saving {} on AWS Glacier...'.format(internal_filepath))
            print(archive_description)
        with open(internal_filepath, 'rb') as file_object:
            response = glacier_client.upload_archive(
                vaultName=CONFIG_AWS_GLACIER['vault_name'],
                archiveDescription=archive_description,
                body=file_object,
            )
        # Glacier returns HTTP 201 (Created) when the archive upload succeeds.
        assert response['ResponseMetadata']['HTTPStatusCode'] == 201, \
            archive_description
        archive_id = response['archiveId']
        psql_request_template = """
            UPDATE dce
            SET glacier_id_{} = %s
            WHERE annonce_id = %s AND org_acronym = %s
            ;""".format(file_type)
        cursor.execute(psql_request_template,
                       (archive_id, annonce_id, org_acronym))
        connection.commit()

    cursor.execute(
        """
        UPDATE dce
        SET state = %s
        WHERE annonce_id = %s AND org_acronym = %s
        ;""", (STATE_GLACIER_OK, annonce_id, org_acronym))
    connection.commit()

    if CONFIG_ENV['env'] != 'production':
        print('Saved {}-{} on AWS Glacier'.format(annonce_id, org_acronym))
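
A sketch of the objects this variant expects, assuming boto3 for the Glacier client and psycopg2 for PostgreSQL; every literal below is a placeholder.

import boto3
import psycopg2

glacier_client = boto3.client('glacier')
connection = psycopg2.connect(dbname='place', user='place')  # placeholder credentials
cursor = connection.cursor()

save_dce('123', 'ABC', 'Intitule du marche',            # placeholder identifiers
         'reglement.pdf', None, 'avis.pdf', 'dce.zip',  # None files are skipped
         connection, cursor, glacier_client)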
Example #5
def index_dce(dce_data, tika_server_url):
    """index_dce(): Extract the content of one DCE and give it to ElasticSearch
    """

    # Read annonce_id before the try block: the except handler below logs it,
    # so it must be defined even if an exception occurs early.
    annonce_id = dce_data['annonce_id']

    try:

        content_list = []

        file_types = ['reglement', 'complement', 'avis', 'dce']
        filenames = [
            dce_data['filename_reglement'], dce_data['filename_complement'],
            dce_data['filename_avis'], dce_data['filename_dce']
        ]

        for file_type, filename in zip(file_types, filenames):
            if not filename:
                continue

            internal_filepath = build_internal_filepath(
                annonce_id=annonce_id,
                original_filename=filename,
                file_type=file_type)
            if CONFIG_ENV['env'] != 'production':
                print('Debug: Extracting content of {}...'.format(
                    internal_filepath))

            content, embedded_resource_paths = extract_file(
                file_path=internal_filepath, tika_server_url=tika_server_url)

            client = MongoClient()
            collection = client.place.dce
            collection.update_one({'annonce_id': annonce_id}, {
                '$set': {
                    'embedded_filenames_{}'.format(file_type):
                    embedded_resource_paths
                }
            })
            client.close()

            content_list.append(content)

        content = '\n'.join(content_list)

        feed_elastisearch(dce_data=dce_data, content=content)

        client = MongoClient()
        collection = client.place.dce
        collection.update_one({'annonce_id': annonce_id},
                              {'$set': {
                                  'state': STATE_CONTENT_INDEXATION_OK
                              }})
        client.close()

        if CONFIG_ENV['env'] != 'production':
            print('Debug: Extracted content from {}'.format(annonce_id))

    except Exception as exception:
        print("Warning: exception occured, aborting DCE ({}: {}) on {}".format(
            type(exception).__name__, exception, annonce_id))
        traceback.print_exc()

        client = MongoClient()
        collection = client.place.dce
        collection.update_one({'annonce_id': annonce_id},
                              {'$set': {
                                  'state': STATE_CONTENT_INDEXATION_KO
                              }})
        client.close()
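
A driver sketch, assuming documents ready for indexation are selected by state; the 'glacier_ok' value is hypothetical, and http://localhost:9998 is merely the default address of an Apache Tika server.

from pymongo import MongoClient

tika_server_url = 'http://localhost:9998'  # default Tika server address (assumption)

client = MongoClient()
for dce_data in client.place.dce.find({'state': 'glacier_ok'}):  # hypothetical state value
    index_dce(dce_data, tika_server_url)
client.close()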