Example #1
0
File: ebi.py Project: mcmk3/qiita
 def _rename_file(fp, new_fp):
     """Place a gzip-compressed version of `fp` at `new_fp`

     Parameters
     ----------
     fp : str
         Path of the source file. If it already ends with '.gz' it is copied
         unchanged, otherwise its content is gzip-compressed
     new_fp : str
         Path of the file to create

     Raises
     ------
     EBISubmissionError
         If the source cannot be read or the compressed file cannot be
         written
     """
     if fp.endswith('.gz'):
         copyfile(fp, new_fp)
         return

     # Compress in-process instead of interpolating the paths into a
     # "gzip -c ... > ..." shell string: paths containing spaces or shell
     # metacharacters would otherwise break (or alter) the command.
     import gzip
     from shutil import copyfileobj

     try:
         with open(fp, 'rb') as src:
             with gzip.open(new_fp, 'wb') as dst:
                 copyfileobj(src, dst)
     except (IOError, OSError) as e:
         # keep the message layout callers/logs already know
         error_msg = ("Error:\nStd output:%s\nStd error:%s" %
                      ('', e))
         raise EBISubmissionError(error_msg)
Example #2
0
 def _rename_file(fp, new_fp):
     """Write `fp` to `new_fp`, making sure the result is gzip-compressed

     Parameters
     ----------
     fp : str
         The input filepath; files whose name ends with '.gz' are copied
         as-is, any other file is compressed while being copied
     new_fp : str
         The output filepath

     Raises
     ------
     EBISubmissionError
         If reading `fp` or writing `new_fp` fails during compression
     """
     if fp.endswith('.gz'):
         copyfile(fp, new_fp)
     else:
         # The gzip module replaces the former "gzip -c %s > %s" shell
         # command, whose unquoted paths failed on names with spaces or
         # shell metacharacters.
         import gzip
         from shutil import copyfileobj

         try:
             with open(fp, 'rb') as plain:
                 with gzip.open(new_fp, 'wb') as packed:
                     copyfileobj(plain, packed)
         except (IOError, OSError) as e:
             error_msg = (
                 "Error:\nStd output:%s\nStd error:%s"
                 % ('', e))
             raise EBISubmissionError(error_msg)
Example #3
0
    def _generate_demultiplexed_fastq_per_sample_FASTQ(self):
        """Create one gzipped forward-reads file per sample

        The 'raw_forward_seqs' filepaths of the artifact are matched against
        a per-sample prefix: the 'run_prefix' category of the prep template
        when present, otherwise the sample name after its first '.'. The
        first remaining file whose basename starts with the prefix is
        written (copied if already '.gz', compressed otherwise) to
        `self.sample_demux_fps[sample_name]` and is not reused.

        Returns
        -------
        set, set
            The sample names for which a file was generated, and the keys of
            `self.samples` for which none was

        Raises
        ------
        EBISubmissionError
            If a file cannot be compressed, or if some filepaths could not
            be matched to any sample
        """
        # local imports: compress in-process rather than through an unquoted
        # "gzip -c %s > %s" shell string, which breaks on paths with spaces
        # or shell metacharacters
        import gzip
        from shutil import copyfileobj

        ar = self.artifact
        fps = [(basename(fp), fp) for _, fp, fpt in ar.filepaths
               if fpt == 'raw_forward_seqs']
        fps.sort(key=lambda x: x[1])
        if 'run_prefix' in self.prep_template.categories():
            rps = [(k, v) for k, v in viewitems(
                self.prep_template.get_category('run_prefix'))]
        else:
            rps = [(v, v.split('.', 1)[1]) for v in self.prep_template.keys()]
        rps.sort(key=lambda x: x[1])
        demux_samples = set()
        for sn, rp in rps:
            for i, (bn, fp) in enumerate(fps):
                if not bn.startswith(rp):
                    continue
                demux_samples.add(sn)
                new_fp = self.sample_demux_fps[sn]
                if fp.endswith('.gz'):
                    copyfile(fp, new_fp)
                else:
                    try:
                        with open(fp, 'rb') as src:
                            with gzip.open(new_fp, 'wb') as dst:
                                copyfileobj(src, dst)
                    except (IOError, OSError) as e:
                        error_msg = (
                            "Error:\nStd output:%s\nStd error:%s" %
                            ('', e))
                        raise EBISubmissionError(error_msg)
                # each file serves a single sample; safe to delete while
                # enumerating because we leave the loop right away
                del fps[i]
                break
        if fps:
            error_msg = (
                'Discrepancy between filepaths and sample names. Extra'
                ' filepaths: %s' % ', '.join([fp[0] for fp in fps]))
            LogEntry.create('Runtime', error_msg)
            raise EBISubmissionError(error_msg)

        return demux_samples, \
            set(self.samples.keys()).difference(demux_samples)
Example #4
0
    def _generate_demultiplexed_fastq_per_sample_FASTQ(self):
        """Write a gzipped forward-reads file for every matching sample

        Each sample is associated with a prefix (its 'run_prefix' value in
        the prep template if that category exists, else the part of the
        sample name after the first '.'). The artifact's 'raw_forward_seqs'
        files are scanned in path order and the first unused file whose
        basename starts with the prefix is copied (when already '.gz') or
        gzip-compressed into `self.sample_demux_fps[sample_name]`.

        Returns
        -------
        set, set
            Sample names that got a file, and keys of `self.samples` that
            did not

        Raises
        ------
        EBISubmissionError
            If compressing a file fails, or if files remain that match no
            sample
        """
        # Imported here so the block does not depend on new module-level
        # imports. The gzip module replaces a shell command built by
        # interpolating unquoted paths.
        import gzip
        from shutil import copyfileobj

        ar = self.artifact
        fps = [(basename(fp), fp) for _, fp, fpt in ar.filepaths
               if fpt == 'raw_forward_seqs']
        fps.sort(key=lambda x: x[1])
        if 'run_prefix' in self.prep_template.categories():
            rps = [(k, v) for k, v in viewitems(
                self.prep_template.get_category('run_prefix'))]
        else:
            rps = [(v, v.split('.', 1)[1]) for v in self.prep_template.keys()]
        rps.sort(key=lambda x: x[1])
        demux_samples = set()
        for sn, rp in rps:
            for i, (bn, fp) in enumerate(fps):
                if bn.startswith(rp):
                    demux_samples.add(sn)
                    new_fp = self.sample_demux_fps[sn]
                    if fp.endswith('.gz'):
                        copyfile(fp, new_fp)
                    else:
                        try:
                            with open(fp, 'rb') as plain:
                                with gzip.open(new_fp, 'wb') as packed:
                                    copyfileobj(plain, packed)
                        except (IOError, OSError) as e:
                            error_msg = (
                                "Error:\nStd output:%s\nStd error:%s"
                                % ('', e))
                            raise EBISubmissionError(error_msg)
                    # consume the file so it cannot match another sample;
                    # the immediate break keeps the enumeration valid
                    del fps[i]
                    break
        if fps:
            error_msg = (
                'Discrepancy between filepaths and sample names. Extra'
                ' filepaths: %s' % ', '.join([fp[0] for fp in fps]))
            LogEntry.create('Runtime', error_msg)
            raise EBISubmissionError(error_msg)

        return demux_samples, \
            set(self.samples.keys()).difference(demux_samples)
Example #5
0
def submit_EBI(artifact_id, action, send, test=False):
    """Submit an artifact to EBI

    Parameters
    ----------
    artifact_id : int
        The artifact id
    action : %s
        The action to perform with this data
    send : bool
        True to actually send the files
    test : bool
        If True some restrictions will be ignored, only used in parse_EBI_reply

    Returns
    -------
    tuple
        The study, sample, biosample, experiment and run accessions obtained
        from parse_EBI_reply, in that order. All five are None when `send`
        is False.

    Raises
    ------
    ComputeError
        If a sequence transfer command or the XML submission command returns
        a non-zero status, or if parse_EBI_reply raises EBISubmissionError
    """
    # step 1: init and validate
    ebi_submission = EBISubmission(artifact_id, action)

    # step 2: generate demux fastq files
    try:
        ebi_submission.generate_demultiplexed_fastq()
    except Exception:
        # remove whatever was generated, record the traceback and re-raise
        error_msg = format_exc()
        if isdir(ebi_submission.full_ebi_dir):
            rmtree(ebi_submission.full_ebi_dir)
        LogEntry.create('Runtime',
                        error_msg,
                        info={'ebi_submission': artifact_id})
        raise

    # step 3: generate and write xml files
    ebi_submission.generate_xml_files()

    if not send:
        return None, None, None, None, None

    # getting aspera's password
    old_ascp_pass = environ.get('ASPERA_SCP_PASS', '')
    if old_ascp_pass == '':
        environ['ASPERA_SCP_PASS'] = qiita_config.ebi_seq_xfer_pass
    ascp_passwd = environ['ASPERA_SCP_PASS']

    # the finally clause guarantees the environment is put back even when
    # a transfer fails or raises unexpectedly
    try:
        # NOTE(review): this "completed successfully" entry is logged
        # before anything is sent; kept as-is, confirm whether intended
        LogEntry.create('Runtime',
                        ('Submission of sequences of pre_processed_id: '
                         '%d completed successfully' % artifact_id))

        # step 4: sending sequences
        if action != 'MODIFY':
            LogEntry.create('Runtime',
                            ("Submitting sequences for pre_processed_id: "
                             "%d" % artifact_id))
            for cmd in ebi_submission.generate_send_sequences_cmd():
                stdout, stderr, rv = system_call(cmd)
                if rv != 0:
                    error_msg = ("ASCP Error:\nStd output:%s\nStd error:%s" %
                                 (stdout, stderr))
                    raise ComputeError(error_msg)
                with open(ebi_submission.ascp_reply, 'a') as reply:
                    reply.write(
                        'stdout:\n%s\n\nstderr: %s' % (stdout, stderr))
    finally:
        environ['ASPERA_SCP_PASS'] = old_ascp_pass

    # step 5: sending xml and parsing answer
    xmls_cmds = ebi_submission.generate_curl_command(
        ebi_seq_xfer_pass=ascp_passwd)
    LogEntry.create('Runtime', ("Submitting XMLs for pre_processed_id: "
                                "%d" % artifact_id))
    xml_content, stderr, rv = system_call(xmls_cmds)
    if rv != 0:
        error_msg = ("Error:\nStd output:%s\nStd error:%s" %
                     (xml_content, stderr))
        raise ComputeError(error_msg)
    LogEntry.create('Runtime',
                    ('Submission of sequences of pre_processed_id: '
                     '%d completed successfully' % artifact_id))
    with open(ebi_submission.curl_reply, 'w') as reply:
        reply.write('stdout:\n%s\n\nstderr: %s' % (xml_content, stderr))

    try:
        st_acc, sa_acc, bio_acc, ex_acc, run_acc = \
            ebi_submission.parse_EBI_reply(xml_content, test=test)
    except EBISubmissionError as e:
        error = str(e)
        le = LogEntry.create('Fatal',
                             "Command: %s\nError: %s\n" %
                             (xml_content, error),
                             info={'ebi_submission': artifact_id})
        raise ComputeError("EBI Submission failed! Log id: %d\n%s" %
                           (le.id, error))

    # accessions are only stored when adding (or when testing)
    if action == 'ADD' or test:
        if st_acc:
            ebi_submission.study.ebi_study_accession = st_acc
        if sa_acc:
            ebi_submission.sample_template.ebi_sample_accessions = sa_acc
        if bio_acc:
            ebi_submission.sample_template.biosample_accessions = bio_acc
        if ex_acc:
            ebi_submission.prep_template.ebi_experiment_accessions = ex_acc
        ebi_submission.artifact.ebi_run_accessions = run_acc

    return st_acc, sa_acc, bio_acc, ex_acc, run_acc
Example #6
0
def submit_VAMPS(artifact_id):
    """Submit artifact to VAMPS

    Parameters
    ----------
    artifact_id : int
        The artifact id

    Returns
    -------
    bool
        True if the reply of the upload matches the expected page (the
        artifact is then flagged as submitted), False otherwise

    Raises
    ------
    ComputeError
        - If the artifact cannot be submitted to VAMPS
        - If the artifact is associated with more than one prep template
        - If the upload command returns a non-zero status
    ValueError
        If the artifact has already been submitted to VAMPS
    """
    artifact = Artifact(artifact_id)
    if not artifact.can_be_submitted_to_vamps:
        raise ComputeError("Artifact %d cannot be submitted to VAMPS" %
                           artifact_id)
    study = artifact.study
    sample_template = study.sample_template
    prep_templates = artifact.prep_templates
    if len(prep_templates) > 1:
        raise ComputeError(
            "Multiple prep templates associated with the artifact: %s" %
            artifact_id)
    prep_template = prep_templates[0]

    # Also need to check that is not submitting (see item in #1523)
    if artifact.is_submitted_to_vamps:
        raise ValueError("Cannot resubmit artifact %s to VAMPS!" % artifact_id)

    # Generating a tgz
    targz_folder = mkdtemp(prefix=qiita_config.working_dir)
    targz_fp = join(targz_folder,
                    '%d_%d_%d.tgz' % (study.id, prep_template.id, artifact_id))
    # the context manager closes the archive even if adding a member fails
    with taropen(targz_fp, mode='w:gz') as targz:
        # adding sample/prep
        samp_fp = join(targz_folder, 'sample_metadata.txt')
        sample_template.to_file(samp_fp)
        targz.add(samp_fp, arcname='sample_metadata.txt')
        prep_fp = join(targz_folder, 'prep_metadata.txt')
        prep_template.to_file(prep_fp)
        targz.add(prep_fp, arcname='prep_metadata.txt')

        # adding preprocessed data
        for _, fp, fp_type in artifact.filepaths:
            if fp_type == 'preprocessed_fasta':
                targz.add(fp, arcname='preprocessed_fasta.fna')

    # submitting
    # the template needs one placeholder per argument; with the password
    # slot replaced by a literal the formatting raised TypeError
    cmd = ("curl -F user=%s -F pass='%s' -F uploadFile=@%s -F "
           "press=UploadFile %s" %
           (qiita_config.vamps_user, qiita_config.vamps_pass, targz_fp,
            qiita_config.vamps_url))
    obs, stderr, rv = system_call(cmd)
    if rv != 0:
        error_msg = ("Error:\nStd output:%s\nStd error:%s" % (obs, stderr))
        raise ComputeError(error_msg)

    exp = ("<html>\n<head>\n<title>Process Uploaded File</title>\n</head>\n"
           "<body>\n</body>\n</html>")

    if obs != exp:
        return False

    artifact.is_submitted_to_vamps = True
    return True
Example #7
0
def submit_EBI(artifact_id, action, send, test=False, test_size=False):
    """Submit an artifact to EBI

    Parameters
    ----------
    artifact_id : int
        The artifact id
    action : %s
        The action to perform with this data
    send : bool
        True to actually send the files
    test : bool
        If True some restrictions will be ignored, only used in parse_EBI_reply
    test_size : bool
        If True the EBI-ENA restriction size will be changed to 5000

    Returns
    -------
    tuple
        The study, sample, biosample, experiment and run accessions, in that
        order. They are all None unless `send` is True and the reply was
        parsed (action 'ADD' or `test`).

    Raises
    ------
    ComputeError
        If the XML files are still larger than the allowed size after
        cleaning, if a transfer or submission command returns a non-zero
        status, or if parse_EBI_reply raises EBISubmissionError
    """
    # step 1: init and validate
    ebi_submission = EBISubmission(artifact_id, action)

    # step 2: generate demux fastq files
    try:
        ebi_submission.generate_demultiplexed_fastq()
    except Exception:
        # remove whatever was generated, record the traceback and re-raise
        error_msg = format_exc()
        if isdir(ebi_submission.full_ebi_dir):
            rmtree(ebi_submission.full_ebi_dir)
        LogEntry.create('Runtime',
                        error_msg,
                        info={'ebi_submission': artifact_id})
        raise

    # step 3: generate and write xml files
    ebi_submission.generate_xml_files()

    # before we continue let's check the size of the submission
    to_review = [
        ebi_submission.study_xml_fp, ebi_submission.sample_xml_fp,
        ebi_submission.experiment_xml_fp, ebi_submission.run_xml_fp,
        ebi_submission.submission_xml_fp
    ]
    total_size = sum([stat(tr).st_size for tr in to_review if tr is not None])
    # the max for EBI is 10M; a much smaller limit is used when test_size
    # is set
    max_size = 10e+6 if not test_size else 5000
    if total_size > max_size:
        LogEntry.create(
            'Runtime', 'The submission: %d is larger than allowed (%d), will '
            'try to fix: %d' % (artifact_id, max_size, total_size))
        # transform current metadata to dataframe for easier curation
        rows = {k: dict(v) for k, v in ebi_submission.samples.items()}
        df = pd.DataFrame.from_dict(rows, orient='index')
        nunique = df.apply(pd.Series.nunique)
        nsamples = len(df.index)
        # drop columns that are almost all the same or almost all unique
        # NOTE(review): an earlier selection (nunique == 1 or
        # nunique == nsamples) used to be computed here and was immediately
        # overwritten by this one; only this rule ever took effect. Confirm
        # whether the union of both rules was intended.
        cols_to_drop = set(nunique[(nunique <= int(nsamples * .01)) |
                                   (nunique >= int(nsamples * .5))].index)
        cols_to_drop = cols_to_drop - {
            'taxon_id', 'scientific_name', 'description'
        }
        # only samples without an EBI accession are rewritten
        all_samples = ebi_submission.sample_template.ebi_sample_accessions
        samples = [k for k in ebi_submission.samples if all_samples[k] is None]
        if samples:
            ebi_submission.write_xml_file(
                ebi_submission.generate_sample_xml(samples, cols_to_drop),
                ebi_submission.sample_xml_fp)

        # now let's recalculate the size to make sure it's fine
        new_total_size = sum(
            [stat(tr).st_size for tr in to_review if tr is not None])
        # arguments follow the wording: new size first, previous size second
        LogEntry.create(
            'Runtime', 'The submission: %d after cleaning is %d and was %d' %
            (artifact_id, new_total_size, total_size))
        if new_total_size > max_size:
            raise ComputeError(
                'Even after cleaning the submission: %d is too large. Before '
                'cleaning: %d, after: %d' %
                (artifact_id, total_size, new_total_size))

    st_acc, sa_acc, bio_acc, ex_acc, run_acc = None, None, None, None, None
    if send:
        # getting aspera's password
        old_ascp_pass = environ.get('ASPERA_SCP_PASS', '')
        if old_ascp_pass == '':
            environ['ASPERA_SCP_PASS'] = qiita_config.ebi_seq_xfer_pass
        ascp_passwd = environ['ASPERA_SCP_PASS']

        # the finally clause guarantees the environment is put back even
        # when a transfer fails or raises unexpectedly
        try:
            # NOTE(review): this "completed successfully" entry is logged
            # before anything is sent; kept as-is, confirm whether intended
            LogEntry.create('Runtime',
                            ('Submission of sequences of pre_processed_id: '
                             '%d completed successfully' % artifact_id))

            # step 4: sending sequences
            if action != 'MODIFY':
                LogEntry.create('Runtime',
                                ("Submitting sequences for pre_processed_id: "
                                 "%d" % artifact_id))
                for cmd in ebi_submission.generate_send_sequences_cmd():
                    stdout, stderr, rv = system_call(cmd)
                    if rv != 0:
                        error_msg = (
                            "ASCP Error:\nStd output:%s\nStd error:%s" %
                            (stdout, stderr))
                        raise ComputeError(error_msg)
                    with open(ebi_submission.ascp_reply, 'a') as reply:
                        reply.write(
                            'stdout:\n%s\n\nstderr: %s' % (stdout, stderr))
        finally:
            environ['ASPERA_SCP_PASS'] = old_ascp_pass

        # step 5: sending xml
        xmls_cmds = ebi_submission.generate_curl_command(
            ebi_seq_xfer_pass=ascp_passwd)
        LogEntry.create('Runtime', ("Submitting XMLs for pre_processed_id: "
                                    "%d" % artifact_id))
        xml_content, stderr, rv = system_call(xmls_cmds)
        if rv != 0:
            error_msg = ("Error:\nStd output:%s\nStd error:%s" %
                         (xml_content, stderr))
            raise ComputeError(error_msg)
        LogEntry.create('Runtime',
                        ('Submission of sequences of pre_processed_id: '
                         '%d completed successfully' % artifact_id))
        with open(ebi_submission.curl_reply, 'w') as reply:
            reply.write('stdout:\n%s\n\nstderr: %s' % (xml_content, stderr))

        # parsing answer / only if adding
        if action == 'ADD' or test:
            try:
                st_acc, sa_acc, bio_acc, ex_acc, run_acc = \
                    ebi_submission.parse_EBI_reply(xml_content, test=test)
            except EBISubmissionError as e:
                error = str(e)
                le = LogEntry.create('Fatal',
                                     "Command: %s\nError: %s\n" %
                                     (xml_content, error),
                                     info={'ebi_submission': artifact_id})
                raise ComputeError("EBI Submission failed! Log id: %d\n%s" %
                                   (le.id, error))

            if st_acc:
                ebi_submission.study.ebi_study_accession = st_acc
            if sa_acc:
                ebi_submission.sample_template.ebi_sample_accessions = sa_acc
            if bio_acc:
                ebi_submission.sample_template.biosample_accessions = bio_acc
            if ex_acc:
                ebi_submission.prep_template.ebi_experiment_accessions = ex_acc
            ebi_submission.artifact.ebi_run_accessions = run_acc

    return st_acc, sa_acc, bio_acc, ex_acc, run_acc
Example #8
0
def submit_EBI(preprocessed_data_id, action, send):
    """Submit a preprocessed data to EBI

    Parameters
    ----------
    preprocessed_data_id : int
        The preprocesssed data id
    action : %s
        The action to perform with this data
    send : bool
        True to actually send the files

    Returns
    -------
    tuple
        The study, sample, biosample, experiment and run accessions obtained
        from parse_EBI_reply, in that order. All five are None when `send`
        is False.

    Raises
    ------
    ComputeError
        If a sequence transfer command or the XML submission command returns
        a non-zero status, or if parse_EBI_reply raises EBISubmissionError
    """
    # step 1: init and validate
    ebi_submission = EBISubmission(preprocessed_data_id, action)

    # step 2: generate demux fastq files
    ebi_submission.study.ebi_submission_status = 'submitting'
    try:
        ebi_submission.generate_demultiplexed_fastq()
    except Exception:
        # remove whatever was generated, flag the study as failed, log and
        # re-raise
        error_msg = format_exc()
        if isdir(ebi_submission.full_ebi_dir):
            rmtree(ebi_submission.full_ebi_dir)
        ebi_submission.study.ebi_submission_status = 'failed: %s' % error_msg
        LogEntry.create('Runtime',
                        error_msg,
                        info={'ebi_submission': preprocessed_data_id})
        raise

    # step 3: generate and write xml files
    ebi_submission.generate_xml_files()

    if not send:
        return None, None, None, None, None

    # step 4: sending sequences
    if action != 'MODIFY':
        old_ascp_pass = environ.get('ASPERA_SCP_PASS', '')
        environ['ASPERA_SCP_PASS'] = qiita_config.ebi_seq_xfer_pass

        # restore the environment even when a transfer fails, so the
        # password does not linger in the process environment
        try:
            LogEntry.create('Runtime',
                            ("Submitting sequences for pre_processed_id: "
                             "%d" % preprocessed_data_id))
            for cmd in ebi_submission.generate_send_sequences_cmd():
                stdout, stderr, rv = system_call(cmd)
                if rv != 0:
                    error_msg = ("Error:\nStd output:%s\nStd error:%s" %
                                 (stdout, stderr))
                    raise ComputeError(error_msg)
                with open(ebi_submission.ascp_reply, 'a') as reply:
                    reply.write(
                        'stdout:\n%s\n\nstderr: %s' % (stdout, stderr))
        finally:
            environ['ASPERA_SCP_PASS'] = old_ascp_pass
        LogEntry.create(
            'Runtime',
            ('Submission of sequences of pre_processed_id: '
             '%d completed successfully' % preprocessed_data_id))

    # step 5: sending xml and parsing answer
    xmls_cmds = ebi_submission.generate_curl_command()
    LogEntry.create('Runtime', ("Submitting XMLs for pre_processed_id: "
                                "%d" % preprocessed_data_id))
    xml_content, stderr, rv = system_call(xmls_cmds)
    if rv != 0:
        error_msg = ("Error:\nStd output:%s\nStd error:%s" %
                     (xml_content, stderr))
        raise ComputeError(error_msg)
    LogEntry.create(
        'Runtime',
        ('Submission of sequences of pre_processed_id: '
         '%d completed successfully' % preprocessed_data_id))
    with open(ebi_submission.curl_reply, 'w') as reply:
        reply.write('stdout:\n%s\n\nstderr: %s' % (xml_content, stderr))

    try:
        st_acc, sa_acc, bio_acc, ex_acc, run_acc = \
            ebi_submission.parse_EBI_reply(xml_content)
    except EBISubmissionError as e:
        le = LogEntry.create('Fatal',
                             "Command: %s\nError: %s\n" %
                             (xml_content, str(e)),
                             info={'ebi_submission': preprocessed_data_id})
        ebi_submission.study.ebi_submission_status = (
            "failed: XML parsing, log id: %d" % le.id)
        raise ComputeError("EBI Submission failed! Log id: %d" % le.id)

    ebi_submission.study.ebi_submission_status = 'submitted'
    # accessions are only stored when adding
    if action == 'ADD':
        if st_acc:
            ebi_submission.study.ebi_study_accession = st_acc
        if sa_acc:
            ebi_submission.sample_template.ebi_sample_accessions = sa_acc
        if bio_acc:
            ebi_submission.sample_template.biosample_accessions = bio_acc
        if ex_acc:
            ebi_submission.prep_template.ebi_experiment_accessions = ex_acc
        ebi_submission.artifact.ebi_run_accessions = run_acc

    return st_acc, sa_acc, bio_acc, ex_acc, run_acc
Example #9
0
def submit_VAMPS(artifact_id):
    """Submit artifact to VAMPS

    Parameters
    ----------
    artifact_id : int
        The artifact id

    Returns
    -------
    bool
        True when the upload reply is the expected page, in which case the
        artifact is marked as submitted to VAMPS; False otherwise

    Raises
    ------
    ComputeError
        - If the artifact cannot be submitted to VAMPS
        - If the artifact is associated with more than one prep template
        - If the upload command returns a non-zero status
    ValueError
        If the artifact has already been submitted to VAMPS
    """
    artifact = Artifact(artifact_id)
    if not artifact.can_be_submitted_to_vamps:
        raise ComputeError("Artifact %d cannot be submitted to VAMPS"
                           % artifact_id)
    study = artifact.study
    sample_template = study.sample_template
    prep_templates = artifact.prep_templates
    if len(prep_templates) > 1:
        raise ComputeError(
            "Multiple prep templates associated with the artifact: %s"
            % artifact_id)
    prep_template = prep_templates[0]

    # Also need to check that is not submitting (see item in #1523)
    if artifact.is_submitted_to_vamps:
        raise ValueError("Cannot resubmit artifact %s to VAMPS!" % artifact_id)

    # Generating a tgz
    targz_folder = mkdtemp(prefix=qiita_config.working_dir)
    targz_fp = join(targz_folder, '%d_%d_%d.tgz' % (study.id,
                                                    prep_template.id,
                                                    artifact_id))
    # using the archive as a context manager closes it on every exit path
    with taropen(targz_fp, mode='w:gz') as targz:
        # adding sample/prep
        samp_fp = join(targz_folder, 'sample_metadata.txt')
        sample_template.to_file(samp_fp)
        targz.add(samp_fp, arcname='sample_metadata.txt')
        prep_fp = join(targz_folder, 'prep_metadata.txt')
        prep_template.to_file(prep_fp)
        targz.add(prep_fp, arcname='prep_metadata.txt')

        # adding preprocessed data
        for _, fp, fp_type in artifact.filepaths:
            if fp_type == 'preprocessed_fasta':
                targz.add(fp, arcname='preprocessed_fasta.fna')

    # submitting
    # four values are interpolated, so the password needs its own
    # placeholder; with a literal in its place the formatting raised
    # TypeError
    cmd = ("curl -F user=%s -F pass='%s' -F uploadFile=@%s -F "
           "press=UploadFile %s" % (qiita_config.vamps_user,
                                    qiita_config.vamps_pass,
                                    targz_fp,
                                    qiita_config.vamps_url))
    obs, stderr, rv = system_call(cmd)
    if rv != 0:
        error_msg = ("Error:\nStd output:%s\nStd error:%s" % (obs, stderr))
        raise ComputeError(error_msg)

    exp = ("<html>\n<head>\n<title>Process Uploaded File</title>\n</head>\n"
           "<body>\n</body>\n</html>")

    if obs != exp:
        return False

    artifact.is_submitted_to_vamps = True
    return True
Example #10
0
def submit_EBI(artifact_id, action, send, test=False, test_size=False):
    """Submit an artifact to EBI

    Parameters
    ----------
    artifact_id : int
        The artifact id
    action : %s
        The action to perform with this data
    send : bool
        True to actually send the files
    test : bool
        If True some restrictions will be ignored, only used in parse_EBI_reply
    test_size : bool
        If True the EBI-ENA restriction size will be changed to 6000

    Returns
    -------
    tuple
        The study, sample, biosample, experiment and run accessions obtained
        from parse_EBI_reply, in that order. All five are None when `send`
        is False.

    Raises
    ------
    ComputeError
        If the XML files are still larger than the allowed size after
        cleaning, if a transfer or submission command returns a non-zero
        status, or if parse_EBI_reply raises EBISubmissionError
    """
    # step 1: init and validate
    ebi_submission = EBISubmission(artifact_id, action)

    # step 2: generate demux fastq files
    try:
        ebi_submission.generate_demultiplexed_fastq()
    except Exception:
        # remove whatever was generated, record the traceback and re-raise
        error_msg = format_exc()
        if isdir(ebi_submission.full_ebi_dir):
            rmtree(ebi_submission.full_ebi_dir)
        LogEntry.create('Runtime', error_msg,
                        info={'ebi_submission': artifact_id})
        raise

    # step 3: generate and write xml files
    ebi_submission.generate_xml_files()

    # before we continue let's check the size of the submission
    to_review = [ebi_submission.study_xml_fp,
                 ebi_submission.sample_xml_fp,
                 ebi_submission.experiment_xml_fp,
                 ebi_submission.run_xml_fp,
                 ebi_submission.submission_xml_fp]
    total_size = sum([stat(tr).st_size for tr in to_review if tr is not None])
    # note that the max for EBI is 10M but let's play it safe
    max_size = 8.5e+6 if not test_size else 6000
    if total_size > max_size:
        LogEntry.create(
            'Runtime', 'The submission: %d is larger than allowed (%d), will '
            'try to fix: %d' % (artifact_id, max_size, total_size))
        # transform current metadata to dataframe for easier curation
        rows = {k: dict(v) for k, v in viewitems(ebi_submission.samples)}
        df = pd.DataFrame.from_dict(rows, orient='index')
        nunique = df.apply(pd.Series.nunique)
        nsamples = len(df.index)
        # drop columns that are almost all the same or almost all unique
        # NOTE(review): an earlier selection (nunique == 1 or
        # nunique == nsamples) used to be computed here and was immediately
        # overwritten by this one; only this rule ever took effect. Confirm
        # whether the union of both rules was intended.
        cols_to_drop = set(
            nunique[(nunique <= int(nsamples * .01)) |
                    (nunique >= int(nsamples * .5))].index)
        cols_to_drop = cols_to_drop - {'taxon_id', 'scientific_name',
                                       'description'}
        all_samples = ebi_submission.sample_template.ebi_sample_accessions
        samples = {k: all_samples[k] for k in ebi_submission.samples}
        ebi_submission.write_xml_file(
            ebi_submission.generate_sample_xml(samples, cols_to_drop),
            ebi_submission.sample_xml_fp)

        # now let's recalculate the size to make sure it's fine
        new_total_size = sum([stat(tr).st_size
                              for tr in to_review if tr is not None])
        # arguments follow the wording: new size first, previous size second
        LogEntry.create(
            'Runtime', 'The submission: %d after cleaning is %d and was %d' % (
                artifact_id, new_total_size, total_size))
        if new_total_size > max_size:
            raise ComputeError(
                'Even after cleaning the submission: %d is too large. Before '
                'cleaning: %d, after: %d' % (
                    artifact_id, total_size, new_total_size))

    if not send:
        return None, None, None, None, None

    # getting aspera's password
    old_ascp_pass = environ.get('ASPERA_SCP_PASS', '')
    if old_ascp_pass == '':
        environ['ASPERA_SCP_PASS'] = qiita_config.ebi_seq_xfer_pass
    ascp_passwd = environ['ASPERA_SCP_PASS']

    # the finally clause guarantees the environment is put back even when
    # a transfer fails or raises unexpectedly
    try:
        # NOTE(review): this "completed successfully" entry is logged
        # before anything is sent; kept as-is, confirm whether intended
        LogEntry.create('Runtime',
                        ('Submission of sequences of pre_processed_id: '
                         '%d completed successfully' % artifact_id))

        # step 4: sending sequences
        if action != 'MODIFY':
            LogEntry.create('Runtime',
                            ("Submitting sequences for pre_processed_id: "
                             "%d" % artifact_id))
            for cmd in ebi_submission.generate_send_sequences_cmd():
                stdout, stderr, rv = system_call(cmd)
                if rv != 0:
                    error_msg = ("ASCP Error:\nStd output:%s\nStd error:%s" % (
                        stdout, stderr))
                    raise ComputeError(error_msg)
                with open(ebi_submission.ascp_reply, 'a') as reply:
                    reply.write(
                        'stdout:\n%s\n\nstderr: %s' % (stdout, stderr))
    finally:
        environ['ASPERA_SCP_PASS'] = old_ascp_pass

    # step 5: sending xml and parsing answer
    xmls_cmds = ebi_submission.generate_curl_command(
        ebi_seq_xfer_pass=ascp_passwd)
    LogEntry.create('Runtime',
                    ("Submitting XMLs for pre_processed_id: "
                     "%d" % artifact_id))
    xml_content, stderr, rv = system_call(xmls_cmds)
    if rv != 0:
        error_msg = ("Error:\nStd output:%s\nStd error:%s" % (
            xml_content, stderr))
        raise ComputeError(error_msg)
    LogEntry.create('Runtime',
                    ('Submission of sequences of pre_processed_id: '
                     '%d completed successfully' % artifact_id))
    with open(ebi_submission.curl_reply, 'w') as reply:
        reply.write('stdout:\n%s\n\nstderr: %s' % (xml_content, stderr))

    try:
        st_acc, sa_acc, bio_acc, ex_acc, run_acc = \
            ebi_submission.parse_EBI_reply(xml_content, test=test)
    except EBISubmissionError as e:
        error = str(e)
        le = LogEntry.create(
            'Fatal', "Command: %s\nError: %s\n" % (xml_content, error),
            info={'ebi_submission': artifact_id})
        raise ComputeError(
            "EBI Submission failed! Log id: %d\n%s" % (le.id, error))

    # accessions are only stored when adding (or when testing)
    if action == 'ADD' or test:
        if st_acc:
            ebi_submission.study.ebi_study_accession = st_acc
        if sa_acc:
            ebi_submission.sample_template.ebi_sample_accessions = sa_acc
        if bio_acc:
            ebi_submission.sample_template.biosample_accessions = bio_acc
        if ex_acc:
            ebi_submission.prep_template.ebi_experiment_accessions = ex_acc
        ebi_submission.artifact.ebi_run_accessions = run_acc

    return st_acc, sa_acc, bio_acc, ex_acc, run_acc
Example #11
0
def submit_EBI(artifact_id, action, send, test=False):
    """Submit an artifact to EBI

    Parameters
    ----------
    artifact_id : int
        The artifact id
    action : %s
        The action to perform with this data
    send : bool
        True to actually send the files
    test : bool
        If True some restrictions will be ignored, only used in parse_EBI_reply

    Returns
    -------
    tuple
        The study, sample, biosample, experiment and run accessions obtained
        from parse_EBI_reply, in that order. All five are None when `send`
        is False.

    Raises
    ------
    ComputeError
        If a sequence transfer command or the XML submission command returns
        a non-zero status, or if parse_EBI_reply raises EBISubmissionError
    """
    # step 1: init and validate
    ebi_submission = EBISubmission(artifact_id, action)

    # step 2: generate demux fastq files
    try:
        ebi_submission.generate_demultiplexed_fastq()
    except Exception:
        # remove whatever was generated, record the traceback and re-raise
        error_msg = format_exc()
        if isdir(ebi_submission.full_ebi_dir):
            rmtree(ebi_submission.full_ebi_dir)
        LogEntry.create('Runtime', error_msg,
                        info={'ebi_submission': artifact_id})
        raise

    # step 3: generate and write xml files
    ebi_submission.generate_xml_files()

    if not send:
        return None, None, None, None, None

    # getting aspera's password
    old_ascp_pass = environ.get('ASPERA_SCP_PASS', '')
    if old_ascp_pass == '':
        environ['ASPERA_SCP_PASS'] = qiita_config.ebi_seq_xfer_pass
    ascp_passwd = environ['ASPERA_SCP_PASS']

    # the finally clause guarantees the environment is put back even when
    # a transfer fails or raises unexpectedly
    try:
        # NOTE(review): this "completed successfully" entry is logged
        # before anything is sent; kept as-is, confirm whether intended
        LogEntry.create('Runtime',
                        ('Submission of sequences of pre_processed_id: '
                         '%d completed successfully' % artifact_id))

        # step 4: sending sequences
        if action != 'MODIFY':
            LogEntry.create('Runtime',
                            ("Submitting sequences for pre_processed_id: "
                             "%d" % artifact_id))
            for cmd in ebi_submission.generate_send_sequences_cmd():
                stdout, stderr, rv = system_call(cmd)
                if rv != 0:
                    error_msg = ("ASCP Error:\nStd output:%s\nStd error:%s" % (
                        stdout, stderr))
                    raise ComputeError(error_msg)
                with open(ebi_submission.ascp_reply, 'a') as reply:
                    reply.write(
                        'stdout:\n%s\n\nstderr: %s' % (stdout, stderr))
    finally:
        environ['ASPERA_SCP_PASS'] = old_ascp_pass

    # step 5: sending xml and parsing answer
    xmls_cmds = ebi_submission.generate_curl_command(
        ebi_seq_xfer_pass=ascp_passwd)
    LogEntry.create('Runtime',
                    ("Submitting XMLs for pre_processed_id: "
                     "%d" % artifact_id))
    xml_content, stderr, rv = system_call(xmls_cmds)
    if rv != 0:
        error_msg = ("Error:\nStd output:%s\nStd error:%s" % (
            xml_content, stderr))
        raise ComputeError(error_msg)
    LogEntry.create('Runtime',
                    ('Submission of sequences of pre_processed_id: '
                     '%d completed successfully' % artifact_id))
    with open(ebi_submission.curl_reply, 'w') as reply:
        reply.write('stdout:\n%s\n\nstderr: %s' % (xml_content, stderr))

    try:
        st_acc, sa_acc, bio_acc, ex_acc, run_acc = \
            ebi_submission.parse_EBI_reply(xml_content, test=test)
    except EBISubmissionError as e:
        error = str(e)
        le = LogEntry.create(
            'Fatal', "Command: %s\nError: %s\n" % (xml_content, error),
            info={'ebi_submission': artifact_id})
        raise ComputeError(
            "EBI Submission failed! Log id: %d\n%s" % (le.id, error))

    # accessions are only stored when adding (or when testing)
    if action == 'ADD' or test:
        if st_acc:
            ebi_submission.study.ebi_study_accession = st_acc
        if sa_acc:
            ebi_submission.sample_template.ebi_sample_accessions = sa_acc
        if bio_acc:
            ebi_submission.sample_template.biosample_accessions = bio_acc
        if ex_acc:
            ebi_submission.prep_template.ebi_experiment_accessions = ex_acc
        ebi_submission.artifact.ebi_run_accessions = run_acc

    return st_acc, sa_acc, bio_acc, ex_acc, run_acc