def download_transcripts(contact_ids=None):
    """Download transcripts for a list of contact_ids.

    Args:
        contact_ids (:obj:`list`, optional): List of Contact IDs to retrieve
            recordings for. If None is provided (default), queries contacts
            that have not been parsed.
    """
    if contact_ids is None:
        query = """
            SELECT contact_id FROM contacts
            WHERE transcript_downloaded=FALSE AND agent_id IS NOT NULL
        """
        data = postgrez.execute(query=query, host=DB_CONF['host'],
                                user=DB_CONF['user'], password=DB_CONF['pwd'],
                                database=DB_CONF['db'])
        contact_ids = [record['contact_id'] for record in data]

    if not contact_ids:
        LOGGER.warning("No contact ids to parse. Exiting..")
        return

    LOGGER.info("Attempting to process %s contact ids", len(contact_ids))
    ice = Icescape()
    for chunked_contact_ids in utils.chunker(contact_ids, 20):
        transcripts = ice.get_recordings(chunked_contact_ids)
        if len(transcripts) < len(chunked_contact_ids):
            retrieved = [transcript['Value']['ContactID']
                         for transcript in transcripts]
            missing = list(set(chunked_contact_ids) - set(retrieved))
            LOGGER.warning('Missing transcripts %s', missing)
            raise Exception("Transcripts not returned for all contact ids")

        # Assumes the API returns recordings in the same order as the
        # requested contact ids.
        for contact_id, transcript in zip(chunked_contact_ids, transcripts):
            filename = "{}_data.txt".format(contact_id)
            save_data(transcript, filename)

        update_query = """
            UPDATE contacts SET transcript_downloaded=TRUE
            WHERE contact_id IN ({})
        """.format(','.join([str(id) for id in chunked_contact_ids]))
        postgrez.execute(query=update_query, host=DB_CONF['host'],
                         user=DB_CONF['user'], password=DB_CONF['pwd'],
                         database=DB_CONF['db'])
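# download_transcripts() batches contact ids through utils.chunker 20 at a
# time. As an illustration of the pattern (not the project's actual utils
# implementation), such a chunker can be as small as:
def _example_chunker(seq, size):
    """Yield successive fixed-size slices of seq; the last may be shorter."""
    for i in range(0, len(seq), size):
        yield seq[i:i + size]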
def enhanced_transcripts():
    """Read in un-processed transcripts from Postgres, perform a series of
    operations to produce metadata per contact_id, and load the results into
    the enhanced_transcripts table.
    """
    query = """
        SELECT contact_id FROM transcripts
        WHERE contact_id NOT IN (SELECT contact_id FROM enhanced_transcripts)
        GROUP BY 1
    """
    data = postgrez.execute(query, host=DB_CONF['host'], user=DB_CONF['user'],
                            password=DB_CONF['pwd'], database=DB_CONF['db'])
    to_load = [record['contact_id'] for record in data]

    transcripts_tforms = config.TRANSFORMS["transcripts"]
    transcripts_meta_tforms = config.TRANSFORMS['transcript_summary']
    optimus = Transformer(transcripts_tforms)
    megatron = Transformer(transcripts_meta_tforms)

    # Build one lazy task chain per contact_id, then execute them all in
    # parallel on the threaded scheduler.
    delayeds = []
    for contact_id in to_load:
        LOGGER.info('Processing transcript for contact_id %s', contact_id)
        dataframe = delayed(load_transcripts_df)([contact_id])
        dataframe = delayed(optimus.run_df_transforms)(dataframe)
        summary = delayed(megatron.run_meta_df_transforms)(dataframe)
        summary = delayed(replace_nans)(summary)
        out = delayed(load_enhanced_transcript)(contact_id, summary)
        delayeds.append(out)

    compute(*delayeds, scheduler='threads', num_workers=20)
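# The function above is built on dask.delayed: each step is wrapped lazily and
# nothing runs until compute(). A minimal, self-contained sketch of the same
# pattern (inc/double are hypothetical stand-ins for the real transforms):
def _example_delayed_pipeline():
    from dask import compute, delayed

    def inc(x):
        return x + 1

    def double(x):
        return x * 2

    # One lazy chain per input; compute() executes them all at once.
    tasks = [delayed(double)(delayed(inc)(i)) for i in range(5)]
    return compute(*tasks, scheduler='threads', num_workers=4)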
def get_transcripts_to_load():
    """Grab the filenames of all transcript files that have not been loaded
    to Postgres.

    Returns:
        list: List of transcript files to load
    """
    transcripts_reg = r"^\d*_data.txt"
    query = "SELECT contact_id FROM transcripts GROUP BY 1"
    data = postgrez.execute(query, host=DB_CONF['host'], user=DB_CONF['user'],
                            password=DB_CONF['pwd'], database=DB_CONF['db'])
    loaded_contacts = [record['contact_id'] for record in data]

    files = utils.search_path(config.ICESCAPE_OUTPUT_DIR, [transcripts_reg])
    filenames = [os.path.basename(file) for file in files]

    to_load = []
    for basename in filenames:
        contact_id = int(basename.split('_data.txt')[0])
        if contact_id not in loaded_contacts:
            to_load.append(basename)

    LOGGER.info("%s transcripts to parse and load", len(to_load))
    return to_load
def get_contacts_to_load():
    """Grab the filenames of all contact files that have not been loaded
    to Postgres.

    Returns:
        list: List of contact filenames to load
    """
    contacts_reg = r"^\w*_\d{4}-\d{1,2}-\d{1,2}_\w*"
    query = "SELECT load_file FROM contacts GROUP BY 1"
    data = postgrez.execute(query, host=DB_CONF['host'], user=DB_CONF['user'],
                            password=DB_CONF['pwd'], database=DB_CONF['db'])
    loaded_files = [record['load_file'] for record in data]

    files = utils.search_path(config.ICESCAPE_OUTPUT_DIR, [contacts_reg])
    filenames = [os.path.basename(file) for file in files]
    return list(set(filenames) - set(loaded_files))
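# Quick illustration of the filename patterns the two loaders above match.
# re.match anchors at the start of the string, like the ^ in the patterns;
# the example filenames here are made up:
def _example_filename_patterns():
    import re
    assert re.match(r"^\d*_data.txt", "12345_data.txt")
    assert re.match(r"^\w*_\d{4}-\d{1,2}-\d{1,2}_\w*", "FTCI_2019-3-7_report.txt")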
def load_transcripts_df(contact_ids):
    """Load the transcripts data associated with a set of contact_ids into
    a pandas DataFrame.

    Args:
        contact_ids (list): List of contact ids to load

    Returns:
        pandas.DataFrame: Dataframe containing the loaded transcripts
    """
    LOGGER.info("Loading transcripts for contact_ids: %s", contact_ids)
    query = """
        SELECT * FROM transcripts
        WHERE contact_id IN ({})
        ORDER BY contact_id, dt ASC
    """.format(','.join([str(id) for id in contact_ids]))
    data = postgrez.execute(query, host=DB_CONF['host'], user=DB_CONF['user'],
                            password=DB_CONF['pwd'], database=DB_CONF['db'])
    dataframe = pd.DataFrame(data)
    return dataframe
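# Hypothetical usage of load_transcripts_df (requires a live database; the
# contact ids below are made up for illustration):
#
#     df = load_transcripts_df([1001, 1002])
#     df.groupby('contact_id').size()  # rows per contact, ordered by dt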
def load_ftci_to_postgres():
    """Load FTCI report files from S3 into Postgres, skipping files that
    have already been recorded in the loaded_reports table.
    """
    # get all files from output dir that have already been loaded
    db_conf = CONF['database']
    query = """
        SELECT report_name FROM loaded_reports
        WHERE report_type='FTCI'
        --AND load_dt > CURRENT_DATE - 2 -- can't add this because files are still in S3
    """
    data = postgrez.execute(query=query, host=db_conf['host'],
                            user=db_conf['user'], password=db_conf['pwd'],
                            database=db_conf['db'])
    loaded_files = [record['report_name'] for record in data]

    # get files from S3 matching prefix
    files = utils.get_s3_keys(S3_BUCKET, prefix="V2")
    unloaded_files = list(set(files) - set(loaded_files))

    loaded_reports = []
    with postgrez.Load(host=db_conf['host'], user=db_conf['user'],
                       password=db_conf['pwd'], database=db_conf['db']) as load:
        for key in unloaded_files:
            contents = utils.read_s3_file(S3_BUCKET, key)
            if contents == '':
                # empty report: nothing to load, but mark it as seen
                loaded_reports.append(key)
                continue
            parsed_contents = utils.parse_s3_contents(contents, '|',
                                                      remove_dupes=True)
            load.load_from_object(table_name='ftci', data=parsed_contents)
            loaded_reports.append(key)

        # TODO: if one of the queries above fails, the loaded_reports
        # bookkeeping below doesn't run
        if len(loaded_reports) > 0:
            dt = datetime.now()
            load_data = [[dt, report_name, 'FTCI']
                         for report_name in loaded_reports]
            load.load_from_object(table_name='loaded_reports', data=load_data)
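# The loader above keys idempotency off the loaded_reports table: anything
# present in S3 but absent from loaded_reports gets (re)loaded. A minimal
# sketch of that set-difference bookkeeping with plain lists (the key names
# are illustrative):
def _example_unloaded_keys():
    s3_keys = ["V2/a.txt", "V2/b.txt", "V2/c.txt"]
    already_loaded = ["V2/a.txt"]
    return sorted(set(s3_keys) - set(already_loaded))  # ['V2/b.txt', 'V2/c.txt']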