Example no. 1
0
def download_transcripts(contact_ids=None):
    """Download transcripts for a list of contact_ids.

    Args:
        contact_ids (:obj:`list`, optional): List of Contact IDs to retrieve
            recordings for. If None are provided (default), queries contacts
            that have not been parsed.

    Raises:
        Exception: If the Icescape API does not return a transcript for
            every contact id in a chunk.
    """
    if contact_ids is None:
        # Default scope: contacts with an agent whose transcript has not
        # yet been downloaded.
        query = """
            SELECT contact_id FROM contacts WHERE transcript_downloaded=FALSE
            AND agent_id IS NOT NULL
            """
        data = postgrez.execute(query=query,
                                host=DB_CONF['host'],
                                user=DB_CONF['user'],
                                password=DB_CONF['pwd'],
                                database=DB_CONF['db'])
        contact_ids = [record['contact_id'] for record in data]

    if not contact_ids:
        LOGGER.warning("No contact ids to parse. Exiting..")
        return

    LOGGER.info("Attempting to process %s contact ids", len(contact_ids))
    ice = Icescape()
    # Work in chunks of 20 ids per API call.
    for chunked_contact_ids in utils.chunker(contact_ids, 20):
        transcripts = ice.get_recordings(chunked_contact_ids)
        if len(transcripts) < len(chunked_contact_ids):
            retrieved = [
                transcript['Value']['ContactID'] for transcript in transcripts
            ]
            missing = list(set(chunked_contact_ids) - set(retrieved))
            LOGGER.warning('Missing transcripts %s', missing)
            raise Exception("Transcripts not returned for all contact ids")
        # NOTE(review): pairing by position assumes get_recordings returns
        # transcripts in request order — confirm against the Icescape API.
        for contact_id, transcript in zip(chunked_contact_ids, transcripts):
            filename = "{}_data.txt".format(contact_id)
            save_data(transcript, filename)

        # Fix: don't shadow the builtin `id` in the comprehension. The ids
        # come from our own DB / caller, so string interpolation is
        # acceptable here, though a parameterized query would be safer.
        id_csv = ','.join(str(cid) for cid in chunked_contact_ids)
        update_query = """
            UPDATE contacts SET transcript_downloaded=TRUE
            WHERE contact_id IN ({})
            """.format(id_csv)
        postgrez.execute(query=update_query,
                         host=DB_CONF['host'],
                         user=DB_CONF['user'],
                         password=DB_CONF['pwd'],
                         database=DB_CONF['db'])
Example no. 2
0
def enhanced_transcripts():
    """Read in un-processed transcripts from Postgres, perform a series of
    operations to produce metadata per contact_id, load into table
    enhanced_transcripts
    """
    # Contact ids present in `transcripts` but not yet summarized.
    query = """
        SELECT contact_id FROM transcripts WHERE contact_id NOT IN
        (SELECT contact_id FROM enhanced_transcripts)
        GROUP BY 1
        """
    rows = postgrez.execute(query,
                            host=DB_CONF['host'],
                            user=DB_CONF['user'],
                            password=DB_CONF['pwd'],
                            database=DB_CONF['db'])
    pending_ids = [row['contact_id'] for row in rows]

    # One transformer for row-level transforms, one for the per-contact
    # summary transforms.
    row_transformer = Transformer(config.TRANSFORMS["transcripts"])
    summary_transformer = Transformer(config.TRANSFORMS['transcript_summary'])

    # Build a lazy task graph: load -> transform -> summarize -> clean -> load.
    tasks = []
    for contact_id in pending_ids:
        LOGGER.info('Processing transcript for contact_id %s', contact_id)
        frame = delayed(load_transcripts_df)([contact_id])
        frame = delayed(row_transformer.run_df_transforms)(frame)
        summary = delayed(summary_transformer.run_meta_df_transforms)(frame)
        summary = delayed(replace_nans)(summary)
        tasks.append(delayed(load_enhanced_transcript)(contact_id, summary))

    # Execute the whole graph on a thread pool.
    compute(*tasks, scheduler='threads', num_workers=20)
Example no. 3
0
def get_transcripts_to_load():
    """Grab the filenames of all transcript files that have not been loaded to
    Postgres

    Returns:
        list: List of transcript files to load
    """
    # Fix: escape the dot so the pattern matches a literal ".txt";
    # previously "." matched any character (e.g. "123_dataXtxt").
    transcripts_reg = r"^\d*_data\.txt"
    query = "SELECT contact_id FROM transcripts GROUP BY 1"
    data = postgrez.execute(query,
                            host=DB_CONF['host'],
                            user=DB_CONF['user'],
                            password=DB_CONF['pwd'],
                            database=DB_CONF['db'])
    # Set for O(1) membership tests in the loop below.
    loaded_contacts = {record['contact_id'] for record in data}
    files = utils.search_path(config.ICESCAPE_OUTPUT_DIR, [transcripts_reg])
    filenames = [os.path.basename(file) for file in files]

    to_load = []
    # `filenames` are already basenames; the original re-applied
    # os.path.basename redundantly.
    for basename in filenames:
        contact_id = int(basename.split('_data.txt')[0])
        if contact_id not in loaded_contacts:
            to_load.append(basename)
    LOGGER.info("%s transcripts to parse and load", len(to_load))
    return to_load
Example no. 4
0
def get_contacts_to_load():
    """Grab the filenames of all contact files that have not been loaded to
    Postgres.

    Returns:
        list: List of contact filenames to load
    """
    # Pattern for contact report files: <word>_<YYYY-M-D>_<word>.
    contacts_reg = r"^\w*_\d{4}-\d{1,2}-\d{1,2}_\w*"
    query = "SELECT load_file FROM contacts GROUP BY 1"
    rows = postgrez.execute(query,
                            host=DB_CONF['host'],
                            user=DB_CONF['user'],
                            password=DB_CONF['pwd'],
                            database=DB_CONF['db'])
    already_loaded = {row['load_file'] for row in rows}
    matches = utils.search_path(config.ICESCAPE_OUTPUT_DIR, [contacts_reg])
    found = {os.path.basename(path) for path in matches}
    # Anything on disk that isn't recorded as loaded still needs loading.
    return list(found - already_loaded)
Example no. 5
0
def load_transcripts_df(contact_ids):
    """Load the transcripts data associated with a set of contact_ids into
    a pandas Dataframe.

    Args:
        contact_ids (list): List of contact ids to load

    Returns:
        pandas.Dataframe: Dataframe containing the loaded transcripts;
            empty if no contact ids are given.
    """
    # Fix: an empty id list would render "IN ()", which is invalid SQL.
    if not contact_ids:
        return pd.DataFrame()
    LOGGER.info("Loading transcripts for contact_ids: %s", contact_ids)
    # Fix: avoid shadowing the builtin `id`. Ids originate from our own
    # DB, so interpolation is acceptable here.
    id_csv = ','.join(str(cid) for cid in contact_ids)
    query = """
        SELECT * FROM transcripts WHERE contact_id IN ({})
        ORDER BY contact_id, dt ASC
        """.format(id_csv)
    data = postgrez.execute(query,
                            host=DB_CONF['host'],
                            user=DB_CONF['user'],
                            password=DB_CONF['pwd'],
                            database=DB_CONF['db'])
    dataframe = pd.DataFrame(data)
    return dataframe
Example no. 6
0
def load_ftci_to_postgres():
    """Load new FTCI report files from S3 into the Postgres ``ftci`` table.

    Queries ``loaded_reports`` for report names already loaded, diffs them
    against S3 keys under the "V2" prefix, loads each new file's
    pipe-delimited contents into ``ftci``, and finally records every
    processed key back into ``loaded_reports``.
    """
    # get all files from output dir that have already been loaded
    db_conf = CONF['database']
    query = """
        SELECT report_name
        FROM loaded_reports
        WHERE report_type='FTCI' --AND load_dt > CURRENT_DATE - 2 -- can't add this part cuase files are still in S3
    """
    data = postgrez.execute(query=query, host=db_conf['host'],
                user=db_conf['user'], password=db_conf['pwd'],
                database=db_conf['db'])
    loaded_files = [record['report_name'] for record in data]
    # get files from S3 matching prefix
    files = utils.get_s3_keys(S3_BUCKET, prefix="V2")

    # Anything in S3 that isn't recorded in loaded_reports still needs loading.
    unloaded_files = list(set(files) - set(loaded_files))

    loaded_reports = []
    with postgrez.Load(
            host=db_conf['host'], user=db_conf['user'],
            password=db_conf['pwd'], database=db_conf['db']) as load:
        for key in unloaded_files:
            contents = utils.read_s3_file(S3_BUCKET, key)
            # Empty files carry no rows: mark as loaded and skip the insert.
            if contents == '':
                loaded_reports.append(key)
                continue
            # Reports are pipe-delimited; duplicate rows are dropped.
            parsed_contents = utils.parse_s3_contents(contents, '|',
                                                        remove_dupes=True)
            load.load_from_object(table_name='ftci',
                                    data=parsed_contents)
            loaded_reports.append(key)

        # TODO(review): if a load above raises, the bookkeeping insert below
        # never runs, so already-loaded keys would be re-loaded next run.
        if len(loaded_reports) > 0:
            dt = datetime.now()
            load_data = [[dt, report_name, 'FTCI']
                            for report_name in loaded_reports]
            load.load_from_object(table_name='loaded_reports', data=load_data)