def calculate(finfo):
    # If the file is missing or unreadable, return the raw info dict with no
    # checksum/size so the caller can tell this entry apart from a real result
    try:
        size = getsize(finfo['fullpath'])
    except (FileNotFoundError, PermissionError):
        return finfo, None, None

    checksum = compute_checksum(finfo['fullpath'])
    return finfo['filepath_id'], checksum, size
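# --- Hedged usage sketch (not from the source) -------------------------------
# Driving calculate() over a batch of filepath records. The `file_infos` list
# and its keys ('filepath_id', 'fullpath') are assumptions based on how the
# function above accesses its argument.
from os.path import getsize  # calculate() above relies on this import

file_infos = [
    {'filepath_id': 1, 'fullpath': '/tmp/example_seqs.fastq'},
    {'filepath_id': 2, 'fullpath': '/tmp/does_not_exist.fastq'},
]

for result in (calculate(fi) for fi in file_infos):
    fid, checksum, size = result
    if checksum is None:
        # Missing or unreadable file: calculate() returned the original dict
        continue
    print(fid, checksum, size)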
def test_update_raw_data_from_cmd_rd_id(self):
    rd = update_raw_data_from_cmd(self.filepaths, self.filepaths_types,
                                  self.study.id, self.study.raw_data()[0])

    # Make sure that we are cleaning the environment
    for _, fp, _ in rd.get_filepaths():
        self._clean_up_files.append(fp)

    # The checksums are in filepath order. If we sort the rd.get_filepaths()
    # result by the filepath (itemgetter(1)) we will get them in the same
    # order, so the checksum comparison will not fail
    for obs, exp in zip(sorted(rd.get_filepaths(), key=itemgetter(1)),
                        self.checksums):
        self.assertEqual(compute_checksum(obs[1]), exp)
def setUp(self):
    fd, seqs_fp = mkstemp(suffix='_seqs.fastq')
    close(fd)
    fd, barcodes_fp = mkstemp(suffix='_barcodes.fastq')
    close(fd)
    self.filepaths = [seqs_fp, barcodes_fp]
    self.checksums = []
    for fp in sorted(self.filepaths):
        with open(fp, 'w') as f:
            f.write("%s\n" % fp)
        self.checksums.append(compute_checksum(fp))
    self.filepaths_types = ["raw_forward_seqs", "raw_barcodes"]
    self._clean_up_files = [seqs_fp, barcodes_fp]

    info = {
        "timeseries_type_id": 1,
        "metadata_complete": True,
        "mixs_compliant": True,
        "number_samples_collected": 25,
        "number_samples_promised": 28,
        "study_alias": "FCM",
        "study_description": "Microbiome of people who eat nothing but "
                             "fried chicken",
        "study_abstract": "Exploring how a high fat diet changes the "
                          "gut microbiome",
        "emp_person_id": StudyPerson(2),
        "principal_investigator_id": StudyPerson(3),
        "lab_person_id": StudyPerson(1)
    }
    self.new_study = Study.create(User("*****@*****.**"),
                                  "Update raw data test", efo=[1], info=info)
    self.study = Study(1)

    # The files for the RawData object attached to study 1 do not exist.
    # Create them so we can actually perform the tests
    for _, fp, _ in RawData(1).get_filepaths():
        with open(fp, 'w') as f:
            f.write('\n')
        self._clean_up_files.append(fp)

    self.uploaded_files = get_files_from_uploads_folders(
        str(self.study.id))
def test_compute_checksum(self):
    """Correctly returns the file checksum"""
    obs = compute_checksum(self.filepath)
    exp = 1719580229
    self.assertEqual(obs, exp)
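# --- Hedged sketch (not from the source) --------------------------------------
# The expected value in the test above is a plain integer, which is consistent
# with a CRC32-style checksum over the file contents. The sketch below assumes
# a chunked zlib.crc32 implementation; the project's real compute_checksum may
# differ in details.
from zlib import crc32


def compute_checksum_sketch(filepath, chunk_size=1024 * 1024):
    """Return a CRC32 checksum of the file at `filepath` (illustrative only)."""
    checksum = 0
    with open(filepath, 'rb') as f:
        # Read in fixed-size binary chunks so large files stay out of memory
        for chunk in iter(lambda: f.read(chunk_size), b''):
            checksum = crc32(chunk, checksum)
    # Mask to a non-negative 32-bit value, matching the integer in the test
    return checksum & 0xFFFFFFFF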
def create_non_rarefied_biom_artifact(analysis, biom_data, rarefied_table):
    """Creates the initial non-rarefied BIOM artifact of the analysis

    Parameters
    ----------
    analysis : dict
        Dictionary with the analysis information
    biom_data : dict
        Dictionary with the biom file information
    rarefied_table : biom.Table
        The rarefied BIOM table

    Returns
    -------
    int
        The id of the new artifact
    """
    # The non-rarefied biom artifact is the initial biom table of the
    # analysis. This table does not currently exist anywhere, so we need to
    # actually create the BIOM file. To create this BIOM file we need:
    # (1) the samples and artifacts they come from and (2) whether the
    # samples were renamed or not. (1) is in the database, but we need to
    # infer (2) from the existing rarefied BIOM table. Fun, fun...
    with TRN:
        # Get the samples included in the BIOM table grouped by artifact id
        # Note that the analysis contains a BIOM table per data type included
        # in it, and the table analysis_sample does not differentiate between
        # data types, so we need to check the data type in the artifact table
        sql = """SELECT artifact_id, array_agg(sample_id)
                 FROM qiita.analysis_sample
                    JOIN qiita.artifact USING (artifact_id)
                 WHERE analysis_id = %s AND data_type_id = %s
                 GROUP BY artifact_id"""
        TRN.add(sql, [analysis['analysis_id'], biom_data['data_type_id']])
        samples_by_artifact = TRN.execute_fetchindex()

        # Create an empty BIOM table to be the new master table
        new_table = Table([], [], [])
        ids_map = {}
        for a_id, samples in samples_by_artifact:
            # Get the filepath of the BIOM table from the artifact
            artifact = Artifact(a_id)
            biom_fp = None
            for _, fp, fp_type in artifact.filepaths:
                if fp_type == 'biom':
                    biom_fp = fp
            # Note that we are sure that the biom table exists, so there is
            # no need to check if biom_fp is undefined
            biom_table = load_table(biom_fp)
            samples = set(samples).intersection(biom_table.ids())
            biom_table.filter(samples, axis='sample', inplace=True)
            # we need to check if the table has samples left before merging
            if biom_table.shape[0] != 0 and biom_table.shape[1] != 0:
                new_table = new_table.merge(biom_table)
                ids_map.update({sid: "%d.%s" % (a_id, sid)
                                for sid in biom_table.ids()})

        # Check if we need to rename the sample ids in the biom table
        new_table_ids = set(new_table.ids())
        if not new_table_ids.issuperset(rarefied_table.ids()):
            # We need to rename the sample ids
            new_table.update_ids(ids_map, 'sample', True, True)

        sql = """INSERT INTO qiita.artifact
                    (generated_timestamp, data_type_id, visibility_id,
                     artifact_type_id, submitted_to_vamps)
                 VALUES (%s, %s, %s, %s, %s)
                 RETURNING artifact_id"""
        # Magic number 4 -> visibility sandbox
        # Magic number 7 -> biom artifact type
        TRN.add(sql, [analysis['timestamp'], biom_data['data_type_id'],
                      4, 7, False])
        artifact_id = TRN.execute_fetchlast()

        # Associate the artifact with the analysis
        sql = """INSERT INTO qiita.analysis_artifact
                    (analysis_id, artifact_id)
                 VALUES (%s, %s)"""
        TRN.add(sql, [analysis['analysis_id'], artifact_id])

        # Link the artifact with its file
        dd_id, mp = get_mountpoint('BIOM')[0]
        dir_fp = join(get_db_files_base_dir(), mp, str(artifact_id))
        if not exists(dir_fp):
            makedirs(dir_fp)
        new_table_fp = join(dir_fp, "biom_table.biom")
        with biom_open(new_table_fp, 'w') as f:
            new_table.to_hdf5(f, "Generated by Qiita")

        sql = """INSERT INTO qiita.filepath
                    (filepath, filepath_type_id, checksum,
                     checksum_algorithm_id, data_directory_id)
                 VALUES (%s, %s, %s, %s, %s)
                 RETURNING filepath_id"""
        # Magic number 7 -> filepath_type_id = 'biom'
        # Magic number 1 -> the checksum algorithm id
        TRN.add(sql, [basename(new_table_fp), 7,
                      compute_checksum(new_table_fp), 1, dd_id])
        fp_id = TRN.execute_fetchlast()

        sql = """INSERT INTO qiita.artifact_filepath
                    (artifact_id, filepath_id)
                 VALUES (%s, %s)"""
        TRN.add(sql, [artifact_id, fp_id])
        TRN.execute()

    return artifact_id
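# --- Hedged usage sketch (not from the source) --------------------------------
# How a migration script might drive create_non_rarefied_biom_artifact().
# `db_analyses` and the keys 'biom_tables' / 'biom_fp' are hypothetical
# placeholders; only 'analysis_id', 'timestamp' and 'data_type_id' are keys the
# function above actually reads.
from biom import load_table

for analysis in db_analyses:  # hypothetical: analysis rows from the database
    for biom_data in analysis['biom_tables']:  # hypothetical per-data-type info
        rarefied_table = load_table(biom_data['biom_fp'])  # existing rarefied BIOM
        artifact_id = create_non_rarefied_biom_artifact(
            analysis, biom_data, rarefied_table)
        print('Created non-rarefied artifact %d for analysis %d'
              % (artifact_id, analysis['analysis_id']))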
        for vv in v:
            to_merge[vv] = k
    merge_fn = (lambda id_, x: to_merge[id_]
                if id_ in to_merge else id_)
    t = t.collapse(merge_fn, norm=False, min_group_size=1,
                   axis='observation', collapse_f=collapse_f)
else:
    ids_to_replace = {c: c.upper() for c in current if c != c.upper()}
    t.update_ids(ids_to_replace, axis='observation', strict=False,
                 inplace=True)

with biom_open(biom, 'w') as f:
    t.to_hdf5(f, t.generated_by)

checksum = compute_checksum(biom)
TRN.add(sql, [checksum, ftps['biom'][0]])

fna = ftps['preprocessed_fasta'][1]
tmp = fna + '.tmp'
with open(tmp, 'w') as out:
    for seq in t.ids('observation'):
        out.write('>%s\n%s\n' % (seq, seq))
rename(tmp, fna)

checksum = compute_checksum(fna)
TRN.add(sql, [checksum, ftps['preprocessed_fasta'][0]])

TRN.execute()
# it nicely
with TRN:
    checksum = None
    if fpt == 'biom':
        t = load_table(fp)
        current = t.ids('observation')
        # Materialize the map so it can be sized and reused under Python 3
        updated = list(map(lambda x: x.upper(), current))
        if len(set(updated)) != len(updated):
            print('************>', a.id, fp, '<**************')
        if set(current) ^ set(updated):
            print('Changing biom: ', a.id, fp)
            t.update_ids({i: i.upper() for i in t.ids('observation')},
                         axis='observation', inplace=True)
            with biom_open(fp, 'w') as f:
                t.to_hdf5(f, t.generated_by)
            checksum = compute_checksum(fp)
    elif fpt == 'preprocessed_fasta':
        changed = False
        tmp = fp + '.tmp'
        with open(tmp, 'w') as out:
            for seq in read(fp, format='fasta'):
                seq = str(seq)
                sequ = seq.upper()
                out.write('>%s\n%s\n' % (sequ, sequ))
                if seq != sequ:
                    changed = True
        if changed:
            print('Changing biom: ', a.id, fp)
            rename(tmp, fp)
            checksum = compute_checksum(fp)
    else:
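# --- Hedged illustration (not from the source) ---------------------------------
# The biom branch above relies on Table.update_ids to uppercase observation ids
# in place; a standalone toy example of that call (ids and counts are made up).
import numpy as np
from biom import Table

toy = Table(np.array([[1, 2], [3, 4]]), ['seqA', 'seqb'], ['S1', 'S2'])
toy.update_ids({i: i.upper() for i in toy.ids('observation')},
               axis='observation', inplace=True)
print(list(toy.ids('observation')))  # ['SEQA', 'SEQB']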
fpath = join(dirname(abspath(__file__)), 'support_files', 'patches',
             'python_patches', '74.py.cache.tsv')
cache = dict()
if exists(fpath):
    df = pd.read_csv(fpath, sep='\t', index_col=0, dtype=str,
                     names=['filepath_id', 'checksum', 'fp_size'])
    cache = df.to_dict('index')

for fid in fids:
    if fid not in cache:
        finfo = get_filepath_information(fid)
        try:
            size = getsize(finfo['fullpath'])
        except FileNotFoundError:
            size = 0

        try:
            checksum = compute_checksum(finfo['fullpath'])
        except FileNotFoundError:
            checksum = ''
    else:
        checksum = cache[fid]['checksum']
        size = cache[fid]['fp_size']

    with TRN:
        sql = """UPDATE qiita.filepath
                 SET fp_size = %s, checksum = %s
                 WHERE filepath_id = %s"""
        TRN.add(sql, tuple([size, checksum, fid]))
        TRN.execute()
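# --- Hedged sketch (not from the source) ----------------------------------------
# The patch above only reads 74.py.cache.tsv. A writer that would produce a
# compatible file (headerless, tab-separated: filepath_id, checksum, fp_size)
# might look like this; it is an illustrative assumption, not part of the patch.
import pandas as pd


def write_checksum_cache(rows, fpath):
    """Write (filepath_id, checksum, fp_size) tuples as a headerless TSV."""
    df = pd.DataFrame(rows, columns=['filepath_id', 'checksum', 'fp_size'])
    df.to_csv(fpath, sep='\t', header=False, index=False)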