Example #1
from os.path import getsize

from qiita_db.util import compute_checksum


def calculate(finfo):
    """Return the (filepath_id, checksum, size) triple for a filepath row"""
    try:
        size = getsize(finfo['fullpath'])
    except (FileNotFoundError, PermissionError):
        # The file is gone or unreadable; return the whole dict so the
        # caller can report which filepath failed
        return finfo, None, None

    checksum = compute_checksum(finfo['fullpath'])

    return finfo['filepath_id'], checksum, size
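
This helper is written to be mapped over many filepath rows; a minimal sketch of a driver loop, assuming the qiita_db.util.get_filepath_information helper that appears in Example #8 below (the filepath ids here are made up):

from qiita_db.util import get_filepath_information

results = {}
for fid in [1, 2, 3]:  # hypothetical filepath ids
    fp_id, checksum, size = calculate(get_filepath_information(fid))
    # the error path returns (finfo, None, None), so skip failed files
    if checksum is not None:
        results[fp_id] = (checksum, size)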
Example #2
    def test_update_raw_data_from_cmd_rd_id(self):
        rd = update_raw_data_from_cmd(self.filepaths, self.filepaths_types,
                                      self.study.id, self.study.raw_data()[0])
        # Make sure that we are cleaning the environment
        for _, fp, _ in rd.get_filepaths():
            self._clean_up_files.append(fp)

        # The checksums are in filepath order. If we sort the
        # rd.get_filepaths() result by the filepath (itemgetter(1)) we get
        # them in the same order, so the checksum comparison will not fail
        for obs, exp in zip(sorted(rd.get_filepaths(), key=itemgetter(1)),
                            self.checksums):
            self.assertEqual(compute_checksum(obs[1]), exp)
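
The alignment trick in the comment above hinges on sorted() with operator.itemgetter(1) ordering the (id, filepath, type) tuples by filepath, the same order the checksums were computed in; a minimal standalone sketch with made-up tuples:

from operator import itemgetter

fps = [(2, '/tmp/b_seqs.fastq', 'raw_forward_seqs'),
       (1, '/tmp/a_barcodes.fastq', 'raw_barcodes')]
for fp_id, fp, fp_type in sorted(fps, key=itemgetter(1)):
    print(fp_id, fp)  # 1 /tmp/a_barcodes.fastq, then 2 /tmp/b_seqs.fastq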
Example #3
    def setUp(self):
        fd, seqs_fp = mkstemp(suffix='_seqs.fastq')
        close(fd)
        fd, barcodes_fp = mkstemp(suffix='_barcodes.fastq')
        close(fd)
        self.filepaths = [seqs_fp, barcodes_fp]
        self.checksums = []
        for fp in sorted(self.filepaths):
            with open(fp, 'w') as f:
                f.write("%s\n" % fp)
            self.checksums.append(compute_checksum(fp))
        self.filepaths_types = ["raw_forward_seqs", "raw_barcodes"]
        self._clean_up_files = [seqs_fp, barcodes_fp]

        info = {
            "timeseries_type_id": 1,
            "metadata_complete": True,
            "mixs_compliant": True,
            "number_samples_collected": 25,
            "number_samples_promised": 28,
            "study_alias": "FCM",
            "study_description": "Microbiome of people who eat nothing but "
                                 "fried chicken",
            "study_abstract": "Exploring how a high fat diet changes the "
                              "gut microbiome",
            "emp_person_id": StudyPerson(2),
            "principal_investigator_id": StudyPerson(3),
            "lab_person_id": StudyPerson(1)
        }
        self.new_study = Study.create(User("*****@*****.**"),
                                      "Update raw data test",
                                      efo=[1], info=info)
        self.study = Study(1)
        # The files for the RawData object attached to study 1 do not exist.
        # Create them so we can actually perform the tests
        for _, fp, _ in RawData(1).get_filepaths():
            with open(fp, 'w') as f:
                f.write('\n')
            self._clean_up_files.append(fp)

        self.uploaded_files = get_files_from_uploads_folders(
            str(self.study.id))
Example #4
    def test_compute_checksum(self):
        """Correctly returns the file checksum"""
        obs = compute_checksum(self.filepath)
        exp = 1719580229
        self.assertEqual(obs, exp)
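
The expected value is a plain integer, which matches a CRC32-style checksum; a minimal sketch of such an implementation, reading in chunks to bound memory use (an illustration under that assumption, not necessarily qiita's exact code):

import zlib


def compute_checksum_sketch(filepath, chunk_size=65536):
    """CRC32 of a file, computed incrementally over fixed-size chunks"""
    crc = 0
    with open(filepath, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            crc = zlib.crc32(chunk, crc)
    return crc & 0xFFFFFFFF  # force an unsigned 32-bit result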
Example #5
File: 54.py Project: tkosciol/qiita
def create_non_rarefied_biom_artifact(analysis, biom_data, rarefied_table):
    """Creates the initial non-rarefied BIOM artifact of the analysis

    Parameters
    ----------
    analysis : dict
        Dictionary with the analysis information
    biom_data : dict
        Dictionary with the biom file information
    rarefied_table : biom.Table
        The rarefied BIOM table

    Returns
    -------
    int
        The id of the new artifact
    """
    # The non-rarefied biom artifact is the initial biom table of the
    # analysis. This table does not currently exist anywhere, so we need to
    # actually create the BIOM file. To create this BIOM file we need: (1)
    # the samples and the artifacts they come from and (2) whether the
    # samples were renamed or not. (1) is in the database, but we need to
    # infer (2) from the existing rarefied BIOM table. Fun, fun...

    with TRN:
        # Get the samples included in the BIOM table grouped by artifact id
        # Note that the analysis contains a BIOM table per data type included
        # in it, and the table analysis_sample does not differentiate between
        # datatypes, so we need to check the data type in the artifact table
        sql = """SELECT artifact_id, array_agg(sample_id)
                 FROM qiita.analysis_sample
                    JOIN qiita.artifact USING (artifact_id)
                 WHERE analysis_id = %s AND data_type_id = %s
                 GROUP BY artifact_id"""
        TRN.add(sql, [analysis['analysis_id'], biom_data['data_type_id']])
        samples_by_artifact = TRN.execute_fetchindex()

        # Create an empty BIOM table to be the new master table
        new_table = Table([], [], [])
        ids_map = {}
        for a_id, samples in samples_by_artifact:
            # Get the filepath of the BIOM table from the artifact
            artifact = Artifact(a_id)
            biom_fp = None
            for _, fp, fp_type in artifact.filepaths:
                if fp_type == 'biom':
                    biom_fp = fp
            # We know the artifact has a biom filepath, so there is no need
            # to check whether biom_fp is still None
            biom_table = load_table(biom_fp)
            samples = set(samples).intersection(biom_table.ids())
            biom_table.filter(samples, axis='sample', inplace=True)
            # we need to check if the table has samples left before merging
            if biom_table.shape[0] != 0 and biom_table.shape[1] != 0:
                new_table = new_table.merge(biom_table)
                ids_map.update({sid: "%d.%s" % (a_id, sid)
                                for sid in biom_table.ids()})

        # Check if we need to rename the sample ids in the biom table
        new_table_ids = set(new_table.ids())
        if not new_table_ids.issuperset(rarefied_table.ids()):
            # We need to rename the sample ids
            new_table.update_ids(ids_map, 'sample', True, True)

        sql = """INSERT INTO qiita.artifact
                    (generated_timestamp, data_type_id, visibility_id,
                     artifact_type_id, submitted_to_vamps)
            VALUES (%s, %s, %s, %s, %s)
            RETURNING artifact_id"""
        # Magic number 4 -> visibility sandbox
        # Magic number 7 -> biom artifact type
        TRN.add(sql, [analysis['timestamp'], biom_data['data_type_id'],
                      4, 7, False])
        artifact_id = TRN.execute_fetchlast()

        # Associate the artifact with the analysis
        sql = """INSERT INTO qiita.analysis_artifact
                    (analysis_id, artifact_id)
                 VALUES (%s, %s)"""
        TRN.add(sql, [analysis['analysis_id'], artifact_id])
        # Link the artifact with its file
        dd_id, mp = get_mountpoint('BIOM')[0]
        dir_fp = join(get_db_files_base_dir(), mp, str(artifact_id))
        if not exists(dir_fp):
            makedirs(dir_fp)
        new_table_fp = join(dir_fp, "biom_table.biom")
        with biom_open(new_table_fp, 'w') as f:
            new_table.to_hdf5(f, "Generated by Qiita")

        sql = """INSERT INTO qiita.filepath
                    (filepath, filepath_type_id, checksum,
                     checksum_algorithm_id, data_directory_id)
                 VALUES (%s, %s, %s, %s, %s)
                 RETURNING filepath_id"""
        # Magic number 7 -> filepath_type_id = 'biom'
        # Magic number 1 -> the checksum algorithm id
        TRN.add(sql, [basename(new_table_fp), 7,
                      compute_checksum(new_table_fp), 1, dd_id])
        fp_id = TRN.execute_fetchlast()
        sql = """INSERT INTO qiita.artifact_filepath
                    (artifact_id, filepath_id)
                 VALUES (%s, %s)"""
        TRN.add(sql, [artifact_id, fp_id])
        TRN.execute()

    return artifact_id
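
The ids_map built above renames every sample to "<artifact_id>.<sample_id>"; a minimal standalone sketch of that rename with biom.Table.update_ids, using a made-up table and artifact id 5:

import numpy as np
from biom import Table

# Two samples attributed to a hypothetical artifact 5
table = Table(np.array([[1, 2], [3, 4]]), ['O1', 'O2'], ['S1', 'S2'])
ids_map = {sid: "%d.%s" % (5, sid) for sid in table.ids()}
# Positional arguments match the call above: (id_map, axis, strict, inplace)
table.update_ids(ids_map, 'sample', True, True)
print(table.ids())  # ['5.S1' '5.S2']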
Example #6
                        for vv in v:
                            to_merge[vv] = k
                merge_fn = (lambda id_, x: to_merge[id_]
                            if id_ in to_merge else id_)
                t = t.collapse(merge_fn, norm=False, min_group_size=1,
                               axis='observation', collapse_f=collapse_f)
            else:
                ids_to_replace = {c: c.upper() for c in current
                                  if c != c.upper()}

            t.update_ids(ids_to_replace, axis='observation', strict=False,
                         inplace=True)

            with biom_open(biom, 'w') as f:
                t.to_hdf5(f, t.generated_by)
            checksum = compute_checksum(biom)

            TRN.add(sql, [checksum, ftps['biom'][0]])

            fna = ftps['preprocessed_fasta'][1]
            tmp = fna + '.tmp'
            with open(tmp, 'w') as out:
                for seq in t.ids('observation'):
                    out.write('>%s\n%s\n' % (seq, seq))
            rename(tmp, fna)
            checksum = compute_checksum(fna)

            TRN.add(sql, [checksum, ftps['preprocessed_fasta'][0]])

            TRN.execute()
Example #7
 # it nicely
 with TRN:
     checksum = None
     if fpt == 'biom':
         t = load_table(fp)
         current = t.ids('observation')
         # A list, not map(): the ids are consumed twice below and, in
         # Python 3, a map object would be exhausted after the first pass
         updated = [x.upper() for x in current]
         if len(set(updated)) != len(updated):
             print('************>', a.id, fp, '<**************')
         if set(current) ^ set(updated):
             print('Changing biom: ', a.id, fp)
             t.update_ids({i: i.upper() for i in t.ids('observation')},
                          axis='observation', inplace=True)
             with biom_open(fp, 'w') as f:
                 t.to_hdf5(f, t.generated_by)
             checksum = compute_checksum(fp)
     elif fpt == 'preprocessed_fasta':
         changed = False
         tmp = fp + '.tmp'
         with open(tmp, 'w') as out:
             for seq in read(fp, format='fasta'):
                 seq = str(seq)
                 sequ = seq.upper()
                 out.write('>%s\n%s\n' % (sequ, sequ))
                 if seq != sequ:
                     changed = True
         if changed:
             print('Changing fasta: ', a.id, fp)
             rename(tmp, fp)
             checksum = compute_checksum(fp)
         else:
Example #8
File: 74.py Project: antgonza/qiita
fpath = join(dirname(abspath(__file__)), 'support_files', 'patches',
             'python_patches', '74.py.cache.tsv')
cache = dict()
if exists(fpath):
    df = pd.read_csv(fpath, sep='\t', index_col=0, dtype=str,
                     names=['filepath_id', 'checksum', 'fp_size'])
    cache = df.to_dict('index')

for fid in fids:
    if fid not in cache:
        finfo = get_filepath_information(fid)
        try:
            size = getsize(finfo['fullpath'])
        except FileNotFoundError:
            size = 0

        try:
            checksum = compute_checksum(finfo['fullpath'])
        except FileNotFoundError:
            checksum = ''
    else:
        checksum = cache[fid]['checksum']
        size = cache[fid]['fp_size']

    with TRN:
        sql = """UPDATE qiita.filepath
                SET fp_size = %s, checksum = %s
                WHERE filepath_id = %s"""
        TRN.add(sql, [size, checksum, fid])
        TRN.execute()
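
The cache file read above is a headerless tab-separated file of filepath_id, checksum, and fp_size (read_csv re-assigns the column names via names=[...]); a minimal sketch of producing one in that shape, with made-up rows:

import pandas as pd

# Hypothetical rows: (filepath_id, checksum, fp_size)
rows = [(1, '1719580229', '1024'), (2, '', '0')]
df = pd.DataFrame(rows, columns=['filepath_id', 'checksum', 'fp_size'])
# No header row, so the patch's read_csv(..., names=[...]) lines up
df.to_csv('74.py.cache.tsv', sep='\t', index=False, header=False)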