def test_pulldown_third_party(self):
    # External (third party) survey data must only show up in the pulldown
    # when it is explicitly requested through ``external``; requested blanks
    # must be part of the pulldown.

    # Add survey answers
    # NOTE(review): the 'U' open flag is a Python 2 idiom that was removed
    # in Python 3.11 - revisit when porting.
    with open(self.ext_survey_fp, 'rU') as f:
        obs = db.store_external_survey(f, 'Vioscreen', separator=',',
                                       survey_id_col='SubjectId',
                                       trim='-160')
    self.assertEqual(obs, 3)

    barcodes = ['000029429', '000018046', '000023299', '000023300']

    # assertIn / assertNotIn are used instead of assertTrue(x in y) so that
    # a failing run reports what was searched for.

    # Test without third party
    obs, _ = db.pulldown(barcodes)
    self.assertNotIn('VIOSCREEN', obs[1])

    obs, _ = db.pulldown(barcodes, blanks=['BLANK.01'])
    survey = obs[1]
    self.assertNotIn('VIOSCREEN', survey)
    self.assertIn('BLANK.01', survey)

    # Test with third party
    obs, _ = db.pulldown(barcodes, external=['Vioscreen'])
    self.assertIn('VIOSCREEN', obs[1])

    obs, _ = db.pulldown(barcodes, blanks=['BLANK.01'],
                         external=['Vioscreen'])
    survey = obs[1]
    self.assertIn('VIOSCREEN', survey)
    self.assertIn('BLANK.01', survey)
def test_pulldown_third_party(self):
    # Store an external (Vioscreen) survey, then pull metadata down with
    # every combination of ``blanks`` / ``external`` and check which markers
    # end up in the text of survey 1.

    # Add survey answers
    with open(self.ext_survey_fp, 'rU') as f:
        obs = db.store_external_survey(
            f, 'Vioscreen', separator=',', survey_id_col='SubjectId',
            trim='-160')
    self.assertEqual(obs, 3)

    barcodes = ['000029429', '000018046', '000023299', '000023300']

    # (pulldown keyword arguments, VIOSCREEN expected?, check the blank?)
    # The first two rows test without third party data, the last two with.
    scenarios = [
        ({}, False, False),
        ({'blanks': ['BLANK.01']}, False, True),
        ({'external': ['Vioscreen']}, True, False),
        ({'blanks': ['BLANK.01'], 'external': ['Vioscreen']}, True, True),
    ]
    for kwargs, expect_vioscreen, check_blank in scenarios:
        obs, _ = db.pulldown(barcodes, **kwargs)
        survey = obs[1]
        # assertIn / assertNotIn give a readable message on failure, unlike
        # assertTrue('...' in survey).
        if expect_vioscreen:
            self.assertIn('VIOSCREEN', survey)
        else:
            self.assertNotIn('VIOSCREEN', survey)
        if check_blank:
            self.assertIn('BLANK.01', survey)
def test_align_with_qiita_categories(self):
    # Rebuild the expected per-sample metadata by hand from a raw pulldown
    # and compare it with what align_with_qiita_categories returns.
    failed_sample = '000017291'
    samples = ['000004216', failed_sample, '000004215']

    # apparently the call to pulldown is not idempotent
    # the first call is != to the second, but the second
    # is equal to the third.
    db.pulldown(samples)
    data = db.pulldown(samples)

    raw_text = StringIO.StringIO(data[0][1])
    data_as_pd = pd.read_csv(raw_text, sep='\t', dtype=str)
    data_as_pd.set_index('sample_name', inplace=True)
    data_as_pd.columns = [name.lower() for name in data_as_pd.columns]

    # as of 15august2019, 000017291 does not successfully pulldown. this
    # sample has an inconsistency in the metadata that triggers a failure
    # condition. This test SHOULD fail when metadata pulldown is
    # successfully revisited.
    self.assertFalse(failed_sample in data_as_pd.index)

    # the failed sample is expected as a row made of the failure marker
    nc = len(data_as_pd.columns)
    failure_row = pd.Series(['pulldown-issue'] * nc,
                            index=data_as_pd.columns,
                            name=failed_sample)
    data_as_pd = data_as_pd.append(failure_row)

    # per a request from Gail
    data_as_pd.loc[failed_sample, 'env_package'] = 'Air'

    absent = set(AG_DEBUG_OBSERVED_CATEGORIES) - set(data_as_pd.columns)
    for category in absent:
        data_as_pd[category] = 'Missing: Not provided'

    exp = {
        '000004216': data_as_pd.loc['000004216'].to_dict(),
        '000017291': data_as_pd.loc['000017291'].to_dict(),
        '000004215': data_as_pd.loc['000004215'].to_dict()
    }
    obs = align_with_qiita_categories(samples,
                                      AG_DEBUG_OBSERVED_CATEGORIES)

    # for an undetermined reason, simply testing equality on the obs
    # and exp dicts is very time consuming.
    self.assertEqual(sorted(obs.keys()), sorted(exp.keys()))
    for sample_id in obs.keys():
        observed_items = sorted(obs[sample_id].items())
        expected_items = sorted(exp[sample_id].items())
        self.assertEqual(observed_items, expected_items)
def post(self):
    """Pull metadata for the posted barcodes and send it back as a zip.

    Reads the comma separated request arguments ``barcodes``, ``blanks``
    and ``external``, runs ``db.pulldown`` on them and writes a zip archive
    holding ``failures.txt`` plus one ``survey_<id>_md.txt`` file per
    survey to the response.
    """
    def _split_optional(name):
        # an empty argument means "nothing selected"
        raw = self.get_argument(name)
        return raw.split(',') if raw else []

    barcodes = self.get_argument('barcodes').split(',')
    blanks = _split_optional('blanks')
    external = _split_optional('external')

    # Get metadata and create zip file
    metadata, failures = db.pulldown(barcodes, blanks, external)

    meta_zip = InMemoryZip()
    # one "barcode<TAB>reason" line per failed barcode
    failure_lines = ['\t'.join(entry) for entry in viewitems(failures)]
    failtext = ("The following barcodes were not retrieved "
                "for any survey:\n%s" % '\n'.join(failure_lines))
    meta_zip.append("failures.txt", failtext)
    for survey, meta in viewitems(metadata):
        meta_zip.append('survey_%s_md.txt' % survey, meta)

    # write out zip file
    response_headers = (
        ('Content-type', 'application/octet-stream'),
        ('Content-Transfer-Encoding', 'binary'),
        ('Accept-Ranges', 'bytes'),
        ('Content-Encoding', 'none'),
        ('Content-Disposition', 'attachment; filename=metadata.zip'),
    )
    for header, value in response_headers:
        self.add_header(header, value)
    self.write(meta_zip.write_to_buffer())
    self.flush()
    self.finish()
def get_ag_details(self, barcode):
    """Collect the American Gut details and verification state of a barcode.

    Parameters
    ----------
    barcode : str
        The barcode to look up.

    Returns
    -------
    tuple of (str, str, dict)
        The id of the div to display, the message to show, and the barcode
        details from ``db.getAGBarcodeDetails``, extended with the
        ``*_checked``, ``login_user`` and ``email_type`` entries when those
        details are present.
    """
    ag_details = db.getAGBarcodeDetails(barcode)
    _, failures = db.pulldown([barcode], [])
    has_details = len(ag_details) > 0

    if not has_details and failures:
        div_id = "no_metadata"
        message = "Cannot retrieve metadata: %s" % failures[barcode]
        return div_id, message, ag_details

    if not has_details:
        # TODO: Stefan Janssen: I cannot see how this case should ever be
        # reached, since failures will be set to 'Unknown reason' at the
        # outmost.
        div_id = "not_assigned"
        message = ("In American Gut project group but no "
                   "American Gut info for barcode")
        ag_details['email_type'] = "-1"
        return div_id, message, ag_details

    # missing values are handed on as empty strings
    unset_columns = [col for col, val in ag_details.iteritems()
                     if val is None]
    for col in unset_columns:
        ag_details[col] = ''

    ag_details['login_user'] = ag_details['name']
    for flag in ('other', 'overloaded', 'moldy'):
        is_set = ag_details[flag] == 'Y'
        ag_details[flag + '_checked'] = 'checked' if is_set else ''

    survey_id = db.get_barcode_survey(barcode)

    # it has all sample details
    # (sample time, date, site)
    if failures:
        outcome = ("no_metadata",
                   "Cannot retrieve metadata: %s" % failures[barcode],
                   "-1")
    elif (survey_id is None and ag_details['environment_sampled']) \
            or survey_id in survey_type:
        outcome = ("verified", "All good", "1")
    else:
        # should never get here (this would happen
        # if the metadata
        # pulldown returned more than one row for a
        # single barcode)
        outcome = ("md_pulldown_error",
                   ("This barcode has multiple entries "
                    "in the database, which should "
                    "never happen. Please notify "
                    "someone on the database crew."),
                   "-1")

    div_id, message, ag_details['email_type'] = outcome
    return div_id, message, ag_details
def test_scrubb_pet_freetext(self):
    # we had the problem that survey question 150 = 'pets_other_freetext'
    # was exported for pulldown, but it has the potential to carry personal
    # information.

    # this is a barcode where an answer to this question is stored in DB
    barcode = '000037487'
    barcodes = [barcode]

    # get free text value from DB
    surveys_from_db = db.get_surveys(barcodes)
    freetextvalue = surveys_from_db[1][barcode]['pets_other_freetext']

    # make sure free text value does NOT show up in pulldown
    pulldown_by_survey = db.pulldown(barcodes)[0]
    for survey_text in pulldown_by_survey.values():
        self.assertNotIn(freetextvalue, survey_text)
def test_pulldown_third_party(self):
    # Store an external (Vioscreen) survey, sanity check the contents of
    # the plain pulldown, then verify that third party data only shows up
    # when requested through ``external``.

    # Add survey answers
    with open(self.ext_survey_fp, 'rU') as f:
        obs = db.store_external_survey(f, 'Vioscreen', separator=',',
                                       survey_id_col='SubjectId',
                                       trim='-160')
    self.assertEqual(obs, 3)

    barcodes = ['000029429', '000018046', '000023299', '000023300']

    # Test without third party
    obs, _ = db.pulldown(barcodes)

    # Parse the metadata into a pandas dataframe to test some invariants
    # This tests does not ensure that the columns have the exact value
    # but at least ensure that the contents looks as expected
    survey_df = pd.read_csv(StringIO(obs[1]), delimiter='\t', dtype=str,
                            encoding='utf-8')
    survey_df.set_index('sample_name', inplace=True, drop=True)

    # Make sure that the prohibited columns from EBI are not in the
    # pulldown
    self.assertEqual(
        set(survey_df.columns).intersection(ebi_remove), set())

    freq_accepted_vals = {
        'Never', 'Rarely (a few times/month)',
        'Regularly (3-5 times/week)', 'Occasionally (1-2 times/week)',
        'Unspecified', 'Daily'
    }
    freq_cols = [
        'ALCOHOL_FREQUENCY', 'PROBIOTIC_FREQUENCY',
        'ONE_LITER_OF_WATER_A_DAY_FREQUENCY', 'POOL_FREQUENCY',
        'FLOSSING_FREQUENCY', 'COSMETICS_FREQUENCY'
    ]
    for col in freq_cols:
        # comparing against the empty set makes a failure list the
        # offending values instead of a bare "False is not true"
        unexpected = set(survey_df[col]) - freq_accepted_vals
        self.assertEqual(unexpected, set(),
                         'unexpected values in column %s' % col)

    # This astype is making sure that the values in the BMI column are
    # values that can be casted to float.
    survey_df[survey_df.BMI != 'Unspecified'].BMI.astype(float)

    for value in set(survey_df.BODY_PRODUCT):
        self.assertTrue(
            value.startswith('UBERON') or value == 'Unspecified',
            'unexpected BODY_PRODUCT value: %r' % (value,))

    survey = obs[1]
    self.assertNotIn('VIOSCREEN', survey)

    obs, _ = db.pulldown(barcodes, blanks=['BLANK.01'])
    survey = obs[1]
    self.assertNotIn('VIOSCREEN', survey)
    self.assertIn('BLANK.01', survey)

    # Test with third party
    obs, _ = db.pulldown(barcodes, external=['Vioscreen'])
    survey = obs[1]
    self.assertIn('VIOSCREEN', survey)

    obs, _ = db.pulldown(barcodes, blanks=['BLANK.01'],
                         external=['Vioscreen'])
    survey = obs[1]
    self.assertIn('VIOSCREEN', survey)
    self.assertIn('BLANK.01', survey)
def post(self):
    """Pull metadata for the posted barcodes and send it back as a zip.

    Request arguments read: ``barcodes``, ``blanks``, ``external``,
    ``selected_ag_surveys`` (integer survey ids) and the optional
    ``merged`` flag (the string ``'True'`` enables it).

    The zip archive written to the response contains ``failures.txt``,
    one ``survey_<name>_md.txt`` file per exported survey and, when
    ``merged`` is ``'True'``, ``surveys_merged_md.txt`` with the exported
    surveys concatenated column-wise on ``sample_name``.
    """
    barcodes = listify(self.get_arguments('barcodes'))
    blanks = listify(self.get_arguments('blanks'))
    # query which surveys have been selected by the user
    selected_ag_surveys = listify(
        self.get_arguments('selected_ag_surveys'))
    external = listify(self.get_arguments('external'))

    selected_ag_surveys = set(int(s) for s in selected_ag_surveys)

    # Get metadata and create zip file
    metadata, failures = db.pulldown(barcodes, blanks, external)

    meta_zip = InMemoryZip()
    failed = '\n'.join('\t'.join(bc) for bc in viewitems(failures))
    failtext = ("The following barcodes were not retrieved "
                "for any survey:\n%s" % failed)
    meta_zip.append("failures.txt", failtext)

    # check database about what surveys are available
    available_agsurveys = {}
    for (_id, name, _) in db.list_ag_surveys():
        available_agsurveys[_id] = name.replace(' ', '_')

    results_as_pd = []
    for survey, meta in viewitems(metadata):
        # Note that ids from the DB are negative, in metadata they are
        # positive! A key that cannot be negated (e.g. the name of an
        # external survey) is kept as it is; "-1 * survey" would silently
        # turn a string into ''.
        try:
            survey_id = -survey
        except TypeError:
            survey_id = survey

        # only create files for those surveys that have been selected by
        # the user. A survey is skipped only if it is a known AG survey
        # that was not selected, so that surveys unknown to the DB listing
        # (presumably external ones) are not blocked.
        if survey_id in available_agsurveys and \
                survey_id not in selected_ag_surveys:
            continue

        # Surveys missing from the DB listing have no name to look up:
        # fall back to the raw metadata key instead of raising KeyError.
        survey_name = available_agsurveys.get(survey_id, survey)
        meta_zip.append('survey_%s_md.txt' % survey_name, meta)

        # transform each survey into a pandas dataframe for later merge
        # read all columns as string to avoid unintended conversions,
        # like cutting leading zeros of barcodes
        pd_meta = pd.read_csv(StringIO(meta), sep="\t", dtype=str)
        # reset the index to barcodes = here sample_name
        pd_meta.set_index('sample_name', inplace=True)
        results_as_pd.append(pd_meta)

    # add the merged table of all selected surveys to the zip archive
    if self.get_argument('merged', default='False') == 'True':
        pd_all = pd.DataFrame()
        if len(results_as_pd) > 0:
            pd_all = pd.concat(results_as_pd, join='outer', axis=1)
        meta_zip.append(
            'surveys_merged_md.txt',
            pd_all.to_csv(sep='\t', index_label='sample_name'))

    # write out zip file
    self.add_header('Content-type', 'application/octet-stream')
    self.add_header('Content-Transfer-Encoding', 'binary')
    self.add_header('Accept-Ranges', 'bytes')
    self.add_header('Content-Encoding', 'none')
    self.add_header('Content-Disposition',
                    'attachment; filename=metadata.zip')
    self.write(meta_zip.write_to_buffer())
    self.flush()
    self.finish()
def align_with_qiita_categories(samples, categories,
                                failure_value='pulldown-issue',
                                omitted_value='Missing: Not provided'):
    """Obtain sample metadata, and subset to those categories present in Qiita

    Parameters
    ----------
    samples : list of str
        The samples to get metadata for
    categories : Iterable of str
        The categories to align against
    failure_value : str, optional
        The default value to use for a sample that failed pulldown.
    omitted_value : str, optional
        The default value to use for a variable not represented either in
        Qiita or the extracted metadata.

    Notes
    -----
    The env_package variable for failures will be autofilled with "Air" per
    a request from Gail.

    Any variable in the extracted metadata that is not represented in Qiita
    will be silently omitted (e.g., PM_USEFUL).

    Any variable in Qiita that is not represented in the extracted metadata
    (e.g., qiita_empo_1) will be filled with the omitted_value.

    If the pulldown does not return any survey at all, only the failed
    samples are represented in the result.

    Returns
    -------
    dict of dict
        A structure of the metadata per sample. {sample-id: {category: value}}
    """
    surveys, failures = db.pulldown(samples)

    # pulldown returns a per-survey (e.g., primary, fermented food, etc) tab
    # delimited file. What we're doing here is de-serializing those data into
    # per survey DataFrames, and then concatenating them together such that
    # each sample ID is a row, each sample ID is only represented once, and
    # the columns correspond to variables from each survey type.
    frames = [pd.read_csv(StringIO.StringIO(text), sep='\t',
                          dtype=str).set_index('sample_name')
              for _, text in sorted(surveys.items())]

    # pd.concat raises a ValueError on an empty list, which is what we get
    # when every sample failed; start from an empty frame in that case.
    if frames:
        surveys_as_df = pd.concat(frames, axis=1)
    else:
        surveys_as_df = pd.DataFrame()

    # oddly, it seems possible in the present pulldown code for an ID to be
    # successful and a failure
    failed_ids = sorted({f for f in failures
                         if f not in surveys_as_df.index})

    # columns in Qiita are lower case
    surveys_as_df.columns = [c.lower() for c in surveys_as_df.columns]

    # subset the frame to the overlapping columns
    categories = set(categories)
    column_overlap = surveys_as_df.columns.intersection(categories)
    surveys_as_df = surveys_as_df[column_overlap]

    # env_package is requested but not part of the pulldown: it will only
    # be introduced by the failure rows below
    env_package_omitted = ('env_package' in categories and
                           'env_package' not in column_overlap)

    # represent failures in the dataframe
    failures_as_df = pd.DataFrame(index=failed_ids,
                                  columns=surveys_as_df.columns)
    failures_as_df = failures_as_df.fillna(failure_value)
    failures_as_df['env_package'] = 'Air'  # per request from Gail

    # concat adds the rows aligned on the columns. DataFrame.append did the
    # same, but it has been removed from pandas 2.0.
    surveys_as_df = pd.concat([surveys_as_df, failures_as_df])

    if env_package_omitted:
        # the column exists only because of the failure rows; every other
        # sample gets the regular marker for an omitted variable
        surveys_as_df['env_package'] = \
            surveys_as_df['env_package'].fillna(omitted_value)

    # missing categories are those in qiita but not in the pulldown. They
    # are determined after the failures were added, so that env_package is
    # never added twice (join raises on overlapping columns).
    missing_categories = categories - set(surveys_as_df.columns)

    # represent missing entries in the dataframe
    missing = pd.DataFrame(index=list(surveys_as_df.index),
                           columns=sorted(missing_categories))
    missing = missing.fillna(omitted_value)

    # join will add columns aligned on the index
    surveys_as_df = surveys_as_df.join(missing)

    return surveys_as_df.to_dict(orient='index')
def test_pulldown_third_party(self):
    # Store an external (Vioscreen) survey, sanity check the contents of
    # the plain pulldown, then verify that third party data only shows up
    # when requested through ``external``.

    # Add survey answers
    with open(self.ext_survey_fp, "rU") as f:
        obs = db.store_external_survey(f, "Vioscreen", separator=",",
                                       survey_id_col="SubjectId",
                                       trim="-160")
    self.assertEqual(obs, 3)

    barcodes = ["000029429", "000018046", "000023299", "000023300"]

    # Test without third party
    obs, _ = db.pulldown(barcodes)

    # Parse the metadata into a pandas dataframe to test some invariants
    # This tests does not ensure that the columns have the exact value
    # but at least ensure that the contents looks as expected
    survey_df = pd.read_csv(StringIO(obs[1]), delimiter="\t", dtype=str,
                            encoding="utf-8")
    survey_df.set_index("sample_name", inplace=True, drop=True)

    # Make sure that the prohibited columns from EBI are not in the
    # pulldown
    self.assertEqual(set(survey_df.columns).intersection(ebi_remove),
                     set())

    freq_accepted_vals = {
        "Never",
        "Rarely (a few times/month)",
        "Regularly (3-5 times/week)",
        "Occasionally (1-2 times/week)",
        "Unspecified",
        "Daily",
    }
    freq_cols = [
        "ALCOHOL_FREQUENCY",
        "PROBIOTIC_FREQUENCY",
        "ONE_LITER_OF_WATER_A_DAY_FREQUENCY",
        "POOL_FREQUENCY",
        "FLOSSING_FREQUENCY",
        "COSMETICS_FREQUENCY",
    ]
    for col in freq_cols:
        unexpected = set(survey_df[col]) - freq_accepted_vals
        self.assertEqual(unexpected, set(),
                         "unexpected values in column %s" % col)

    # This astype is making sure that the values in the BMI column are
    # values that can be casted to float. "Unspecified" is a legitimate
    # placeholder that float() cannot parse, so those rows are left out
    # of the cast.
    survey_df[survey_df.BMI != "Unspecified"].BMI.astype(float)

    for value in set(survey_df.BODY_PRODUCT):
        self.assertTrue(
            value.startswith("UBERON") or value == "Unspecified",
            "unexpected BODY_PRODUCT value: %r" % (value,))

    survey = obs[1]
    self.assertNotIn("VIOSCREEN", survey)

    obs, _ = db.pulldown(barcodes, blanks=["BLANK.01"])
    survey = obs[1]
    self.assertNotIn("VIOSCREEN", survey)
    self.assertIn("BLANK.01", survey)

    # Test with third party
    obs, _ = db.pulldown(barcodes, external=["Vioscreen"])
    survey = obs[1]
    self.assertIn("VIOSCREEN", survey)

    obs, _ = db.pulldown(barcodes, blanks=["BLANK.01"],
                         external=["Vioscreen"])
    survey = obs[1]
    self.assertIn("VIOSCREEN", survey)
    self.assertIn("BLANK.01", survey)
def test_pulldown_third_party(self):
    # Store an external (Vioscreen) survey, sanity check the contents of
    # the plain pulldown, then verify that third party data only shows up
    # when requested through ``external``.

    # Add survey answers
    with open(self.ext_survey_fp, 'rU') as f:
        obs = db.store_external_survey(
            f, 'Vioscreen', separator=',', survey_id_col='SubjectId',
            trim='-160')
    self.assertEqual(obs, 3)

    barcodes = ['000029429', '000018046', '000023299', '000023300']

    # Test without third party
    obs, _ = db.pulldown(barcodes)
    survey = obs[1]

    # Parse the metadata into a pandas dataframe to test some invariants
    # This tests does not ensure that the columns have the exact value
    # but at least ensure that the contents looks as expected
    survey_df = pd.read_csv(
        StringIO(survey), delimiter='\t', dtype=str, encoding='utf-8')
    survey_df.set_index('sample_name', inplace=True, drop=True)

    # Make sure that the prohibited columns from EBI are not in the
    # pulldown
    self.assertEqual(set(survey_df.columns).intersection(ebi_remove),
                     set())

    freq_accepted_vals = {
        'Never', 'Rarely (a few times/month)',
        'Regularly (3-5 times/week)', 'Occasionally (1-2 times/week)',
        'Unspecified', 'Daily'}
    freq_cols = ['ALCOHOL_FREQUENCY', 'PROBIOTIC_FREQUENCY',
                 'ONE_LITER_OF_WATER_A_DAY_FREQUENCY', 'POOL_FREQUENCY',
                 'FLOSSING_FREQUENCY', 'COSMETICS_FREQUENCY']
    for col in freq_cols:
        # a non-empty difference names the values that are not accepted
        unexpected = set(survey_df[col]) - freq_accepted_vals
        self.assertEqual(unexpected, set(),
                         'unexpected values in column %s' % col)

    # This astype is making sure that the values in the BMI column are
    # values that can be casted to float.
    specified_bmi = survey_df[survey_df.BMI != 'Unspecified'].BMI
    specified_bmi.astype(float)

    for value in set(survey_df.BODY_PRODUCT):
        self.assertTrue(
            value.startswith('UBERON') or value == 'Unspecified',
            'unexpected BODY_PRODUCT value: %r' % (value,))

    self.assertNotIn('VIOSCREEN', survey)

    obs, _ = db.pulldown(barcodes, blanks=['BLANK.01'])
    survey = obs[1]
    self.assertNotIn('VIOSCREEN', survey)
    self.assertIn('BLANK.01', survey)

    # Test with third party
    obs, _ = db.pulldown(barcodes, external=['Vioscreen'])
    survey = obs[1]
    self.assertIn('VIOSCREEN', survey)

    obs, _ = db.pulldown(barcodes, blanks=['BLANK.01'],
                         external=['Vioscreen'])
    survey = obs[1]
    self.assertIn('VIOSCREEN', survey)
    self.assertIn('BLANK.01', survey)
def get_ag_details(self, barcode):
    """Collect the American Gut details and verification state of a barcode.

    Parameters
    ----------
    barcode : str
        The barcode to look up.

    Returns
    -------
    tuple of (str, str, dict)
        The id of the div to display, the message to show, and the barcode
        details from ``db.getAGBarcodeDetails`` extended with an
        ``email_type`` entry (plus the ``*_checked`` and ``login_user``
        entries when details are present).
    """
    # outcome shared by "no details at all" and "no sample details"
    not_assigned = ("not_assigned",
                    ("In American Gut project group but No "
                     "American Gut info for barcode"),
                    "-1")

    ag_details = db.getAGBarcodeDetails(barcode)
    if len(ag_details) > 0:
        # missing values are handed on as empty strings
        unset_columns = [col for col, val in ag_details.iteritems()
                         if val is None]
        for col in unset_columns:
            ag_details[col] = ''

        ag_details['login_user'] = ag_details['name']
        for flag in ('other', 'overloaded', 'moldy'):
            is_set = ag_details[flag] == 'Y'
            ag_details[flag + '_checked'] = 'checked' if is_set else ''

        survey_id = db.get_barcode_survey(barcode)
        _, failures = db.pulldown([barcode])

        no_sample_details = (ag_details['sample_date'] ==
                             ag_details['site_sampled'] ==
                             ag_details['sample_time'] == '')
        if no_sample_details:
            outcome = not_assigned
        # from here on it has all sample details
        # (sample time, date, site)
        elif survey_id is None:
            outcome = ("not_assigned", "Missing info", "0")
        elif barcode in failures:
            outcome = ("no_metadata", "Cannot retrieve metadata", "-1")
        elif survey_type[survey_id] == 'Human':
            # and we can successfully retrieve sample
            # metadata
            outcome = ("verified", "All good", "1")
        elif survey_type[survey_id] == 'Animal':
            outcome = ("verified_animal", "All good", "1")
        else:
            # should never get here (this would happen
            # if the metadata
            # pulldown returned more than one row for a
            # single barcode)
            outcome = ("md_pulldown_error",
                       ("This barcode has multiple entries "
                        "in the database, which should "
                        "never happen. Please notify "
                        "someone on the database crew."),
                       "-1")
    else:
        outcome = not_assigned

    div_id, message, ag_details['email_type'] = outcome
    return div_id, message, ag_details
def post(self):
    """Pull metadata for the posted barcodes and send it back as a zip.

    Request arguments read: ``barcodes``, ``blanks``, ``external``,
    ``selected_ag_surveys`` (integer survey ids) and the optional
    ``merged`` flag (the string ``'True'`` enables it).

    The zip archive written to the response contains ``failures.txt``,
    one ``survey_<name>_md.txt`` file per exported survey and, when
    ``merged`` is ``'True'``, ``surveys_merged_md.txt`` with the exported
    surveys concatenated column-wise on ``sample_name``.
    """
    barcodes = listify(self.get_arguments('barcodes'))
    blanks = listify(self.get_arguments('blanks'))
    external = listify(self.get_arguments('external'))
    # query which surveys have been selected by the user
    selected_ag_surveys = {
        int(survey_id)
        for survey_id in listify(self.get_arguments('selected_ag_surveys'))
    }

    # Get metadata and create zip file
    metadata, failures = db.pulldown(barcodes, blanks, external)

    meta_zip = InMemoryZip()
    failed = '\n'.join('\t'.join(bc) for bc in viewitems(failures))
    failtext = ("The following barcodes were not retrieved "
                "for any survey:\n%s" % failed)
    meta_zip.append("failures.txt", failtext)

    # check database about what surveys are available
    available_agsurveys = {
        _id: name.replace(' ', '_')
        for (_id, name, _) in db.list_ag_surveys()
    }

    results_as_pd = []
    for survey, meta in viewitems(metadata):
        # Note that ids from the DB are negative, in metadata they are
        # positive! Keys that cannot be negated (e.g. an external survey
        # name) are used unchanged, because "-1 * survey" would silently
        # turn a string into ''.
        try:
            db_survey_id = -survey
        except TypeError:
            db_survey_id = survey

        is_known = db_survey_id in available_agsurveys
        is_selected = db_survey_id in selected_ag_surveys
        # only create files for those surveys that have been selected by
        # the user; surveys that are unknown to the DB listing (presumably
        # external ones) must not be blocked.
        if not (is_selected or not is_known):
            continue

        # An unknown survey has no entry in available_agsurveys; indexing
        # the dict would raise a KeyError, so fall back to the raw key.
        file_label = available_agsurveys.get(db_survey_id, survey)
        meta_zip.append('survey_%s_md.txt' % file_label, meta)

        # transform each survey into a pandas dataframe for later merge
        # read all columns as string to avoid unintended conversions,
        # like cutting leading zeros of barcodes
        pd_meta = pd.read_csv(StringIO(meta), sep="\t", dtype=str)
        # reset the index to barcodes = here sample_name
        pd_meta.set_index('sample_name', inplace=True)
        results_as_pd.append(pd_meta)

    # add the merged table of all selected surveys to the zip archive
    if self.get_argument('merged', default='False') == 'True':
        if results_as_pd:
            pd_all = pd.concat(results_as_pd, join='outer', axis=1)
        else:
            pd_all = pd.DataFrame()
        meta_zip.append('surveys_merged_md.txt',
                        pd_all.to_csv(sep='\t', index_label='sample_name'))

    # write out zip file
    self.add_header('Content-type', 'application/octet-stream')
    self.add_header('Content-Transfer-Encoding', 'binary')
    self.add_header('Accept-Ranges', 'bytes')
    self.add_header('Content-Encoding', 'none')
    self.add_header('Content-Disposition',
                    'attachment; filename=metadata.zip')
    self.write(meta_zip.write_to_buffer())
    self.flush()
    self.finish()