Example #1
    def create_status_json_file(self):
        """Create [stage name]_status.json, which will include step-level job status updates to be used in idseq-web."""
        log.write(f"Creating {self.step_status_local}")

        with open(self.step_status_local, 'w') as status_file:
            json.dump({}, status_file)
        idseq_dag.util.s3.upload_with_retries(self.step_status_local, self.output_dir_s3 + "/")
Example #2
    def sync_pairs(fastq_files, max_discrepant_fraction=0):
        """The given fastq_files contain the same read IDs but in different order.
        Output the same data in synchronized order. Omit any reads missing their mate
        up to max_discrepant_fraction if necessary. If more must be suppressed,
        indicate it in the second value of the returned tuple.
        """
        if len(fastq_files) != 2:
            return fastq_files, False

        output_fnames = [ifn + ".synchronized_pairs.fq" for ifn in fastq_files]
        with open(fastq_files[0], "rb") as if_0, open(fastq_files[1],
                                                      "rb") as if_1:
            with open(output_fnames[0], "wb") as of_0, open(
                    output_fnames[1], "wb") as of_1:
                outstanding_r0, outstanding_r1, max_mem, total = PipelineStepRunStar.sync_pairs_work(
                    of_0, of_1, if_0, if_1)
        if max_mem:
            # This will be printed if some pairs were out of order.
            msg = "WARNING: Pair order out of sync in {fqf}. Synchronized using RAM for {max_mem} pairs."
            msg = msg.format(fqf=fastq_files, max_mem=max_mem)
            log.write(msg)

        discrepancies_count = len(outstanding_r0) + len(outstanding_r1)
        if discrepancies_count:
            msg = "WARNING: Found {dc} broken pairs in {fqf}, e.g., {example}."
            msg = msg.format(dc=discrepancies_count,
                             fqf=fastq_files,
                             example=(outstanding_r0 or outstanding_r1).popitem()[0])
            log.write(msg)
        too_discrepant = (discrepancies_count > max_discrepant_fraction * total)
        return output_fnames, too_discrepant
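The heavy lifting happens in PipelineStepRunStar.sync_pairs_work which, judging by the outstanding_r0/outstanding_r1 dicts it returns, buffers reads whose mate has not appeared yet, keyed by read ID, and writes a pair out as soon as both sides have been seen. A minimal, self-contained sketch of that buffering idea on in-memory (read_id, data) records; the names and structure are illustrative, not the project's actual implementation:

def sync_pairs_sketch(records_r0, records_r1):
    """Pair (read_id, data) records from two equal-length streams that may be out of order."""
    outstanding_r0, outstanding_r1 = {}, {}
    pairs = []  # (read_id, r0_data, r1_data), in the order pairs complete

    def feed(read_id, data, own_buffer, other_buffer, own_is_r0):
        if read_id in other_buffer:
            mate = other_buffer.pop(read_id)
            r0_data, r1_data = (data, mate) if own_is_r0 else (mate, data)
            pairs.append((read_id, r0_data, r1_data))
        else:
            own_buffer[read_id] = data  # wait for the mate to show up

    for (id0, d0), (id1, d1) in zip(records_r0, records_r1):
        feed(id0, d0, outstanding_r0, outstanding_r1, True)
        feed(id1, d1, outstanding_r1, outstanding_r0, False)
    return pairs, outstanding_r0, outstanding_r1

# Read "b" sits at different positions in the two streams but still pairs up.
pairs, left_r0, left_r1 = sync_pairs_sketch(
    [("a", "A0"), ("b", "B0")], [("b", "B1"), ("a", "A1")])
assert pairs == [("b", "B0", "B1"), ("a", "A0", "A1")] and not left_r0 and not left_r1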
Example #3
def parse_tsv(path,
              schema,
              expect_headers=False,
              raw_lines=False,
              strict=True):
    '''
    Parse TSV file with given schema, yielding a dict per line.
    See BLAST_OUTPUT_SCHEMA, for example.
    When expect_headers=True, treat the first line as column headers.
    When strict mode is True, all columns in the schema are required. When strict mode is False, missing columns are tolerated and omitted from the yielded dict.
    '''
    assert expect_headers == False, "Headers not yet implemented."
    schema_items = list(schema.items())
    with open(path, "r") as stream:
        for line_number, raw_line in enumerate(stream):
            try:
                row = raw_line.rstrip("\n").split("\t")
                if strict and len(row) != len(schema):
                    msg = f"{path}:{line_number + 1}:  Parse error.  Input line does not conform to schema: {schema}"
                    log.write(msg, warning=True)
                length = min(len(row), len(schema))
                row_dict = {
                    cname: ctype(vstr)
                    for vstr, (
                        cname,
                        ctype) in zip(row[0:length], schema_items[0:length])
                }
            except:
                msg = f"{path}:{line_number + 1}:  Parse error.  Input does not conform to schema: {schema}"
                log.write(msg, warning=True)
                raise
            if raw_lines:
                yield row_dict, raw_line
            else:
                yield row_dict
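A minimal usage sketch of the schema convention parse_tsv expects: an ordered mapping of column name to a converter callable, applied positionally to each tab-separated field. The column names below are illustrative (a BLAST-m8-like layout), not the project's actual BLAST_OUTPUT_SCHEMA, and the loop body mirrors the dict comprehension above rather than importing the function:

import io

# Illustrative schema (column name -> converter), in column order.
SCHEMA = {"qseqid": str, "sseqid": str, "pident": float, "length": int}

tsv = io.StringIO("read-1\tNC_000001\t98.5\t150\n")
for raw_line in tsv:
    row = raw_line.rstrip("\n").split("\t")
    row_dict = {name: ctype(value) for value, (name, ctype) in zip(row, SCHEMA.items())}
    print(row_dict)  # {'qseqid': 'read-1', 'sseqid': 'NC_000001', 'pident': 98.5, 'length': 150}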
Example #4
    def update_status_json_file(self, status):
        log.write(
            f"Updating status file for step {self.name} with status {status}")
        # First, update own status dictionary
        if "description" not in self.status_dict:
            self.status_dict["description"] = self.step_description()
        if "resources" not in self.status_dict:
            self.status_dict["resources"] = self.step_resources()
        if "start_time" not in self.status_dict and status == "running":
            self.status_dict["start_time"] = time.time()  # seconds since epoch

        self.status_dict["status"] = status
        if self.input_file_error:
            self.status_dict["error"] = self.input_file_error.name
        if status == "uploaded":
            self.status_dict["end_time"] = time.time()

        self.status_dict["additional_output"] = \
            [self.relative_path(f) for f in self.additional_output_files_visible]

        # Then, update file by reading the json, modifying, and overwriting.
        with self.step_status_lock:
            with open(self.step_status_local, 'r') as status_file:
                status = json.load(status_file)
            status.update({self.name: self.status_dict})
            with open(self.step_status_local, 'w') as status_file:
                json.dump(status, status_file)
            idseq_dag.util.s3.upload_with_retries(self.step_status_local,
                                                  self.output_dir_s3 + "/")
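Putting Examples #1 and #4 together, the uploaded [stage name]_status.json maps each step name to its status dictionary. An illustrative shape, with the step name and every value below invented for the sketch rather than taken from a real run:

# Illustrative only -- the keys mirror status_dict above, the values are made up.
example_stage_status = {
    "some_step_name": {
        "description": "<output of step_description()>",
        "resources": "<output of step_resources()>",
        "status": "uploaded",
        "start_time": 1587000000.0,   # seconds since epoch, set when status == "running"
        "end_time": 1587000123.4,     # set when status == "uploaded"
        "additional_output": ["relative/path/to/extra_output.txt"],
    }
}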
Example #5
 def get_sequence_for_thread(error_flags,
                             accession_info,
                             accession_id,
                             nt_loc_dict,
                             nt_bucket,
                             nt_key,
                             semaphore,
                             mutex,
                             seq_count=[0]):  #pylint: disable=dangerous-default-value
     try:
         ref_seq_len, seq_name, accession_file = PipelineStepGenerateAlignmentViz.get_sequence_by_accession_id_s3(
             accession_id, nt_loc_dict, nt_bucket, nt_key)
         with mutex:
             accession_info['seq_file'] = accession_file
             accession_info['ref_seq_len'] = ref_seq_len
             accession_info['name'] = seq_name
             seq_count[0] += 1
             if seq_count[0] % 100 == 0:
                 msg = f"{seq_count[0]} sequences fetched, most recently " \
                       f"{accession_id}"
                 log.write(msg)
     except:
         with mutex:
             if not error_flags:
                 traceback.print_exc()
             error_flags["error"] = 1
     finally:
         semaphore.release()
Example #6
    def sync_pairs(fastq_files, max_discrepancies=0):
        """The given fastq_files contain the same read IDs but in different order.
        Output the same data in synchronized order. Omit up to max_discrepancies
        if necessary. If more must be suppressed, raise assertion.
        """
        if len(fastq_files) != 2:
            return fastq_files

        output_fnames = [ifn + ".synchronized_pairs.fq" for ifn in fastq_files]
        with open(fastq_files[0], "rb") as if_0, open(fastq_files[1],
                                                      "rb") as if_1:
            with open(output_fnames[0],
                      "wb") as of_0, open(output_fnames[1], "wb") as of_1:
                outstanding_r0, outstanding_r1, max_mem = PipelineStepRunStar.sync_pairs_work(
                    of_0, of_1, if_0, if_1)
        if max_mem:
            # This will be printed if some pairs were out of order.
            msg = "WARNING: Pair order out of sync in {fqf}. " \
                              "Synchronized using RAM for {max_mem} pairs.".format(
                fqf=fastq_files, max_mem=max_mem)
            log.write(msg)

        discrepancies_count = len(outstanding_r0) + len(outstanding_r1)
        if discrepancies_count:
            msg = "WARNING: Found {dc} broken pairs in {fqf}, e.g., " \
                              "{example}.".format(
                dc=discrepancies_count,
                fqf=fastq_files,
                example=(outstanding_r0 or outstanding_r1).popitem()[0])
            log.write(msg)
            assert discrepancies_count <= max_discrepancies, msg
        return output_fnames
Example #7
    def get_taxid_mapping_for_batch(taxids,
                                    taxid2wikidict,
                                    mutex,
                                    semaphore,
                                    max_attempt=3):
        ''' Get wiki mapping for a list of taxids '''
        taxid_str = ",".join(taxids)
        log.write(f"fetching batch {taxid_str}")
        parsed = {}  # ensure `parsed` is defined even if every attempt below fails
        for attempt in range(max_attempt):
            try:
                handle = Entrez.elink(dbfrom="taxonomy",
                                      id=taxid_str,
                                      cmd="llinks")
                record = Entrez.read(handle)
                handle.close()

                parsed = {}
                results = record[0]['IdUrlList']['IdUrlSet']
                for result in results:
                    taxid = result['Id']
                    wikiurl = ""
                    for link in result['ObjUrl']:
                        url = str(link['Url'])
                        if re.search('wikipedia.org', url):
                            wikiurl = url
                            break
                    parsed[taxid] = wikiurl
                break
            except:
                log.write(f"failed batch attempt {attempt}")
                time.sleep(5)
        semaphore.release()
        with mutex:
            taxid2wikidict.update(parsed)
Example #8
    def run(self):
        """
          Generate GSNAP index. To be called from idseq-infra
        """
        nt_db = self.input_files_local[0][0]
        output_nt_index_tar = self.output_files_local()[0]
        output_nt_index_parent_dir = os.path.dirname(output_nt_index_tar)
        output_tar_base = os.path.basename(output_nt_index_tar)
        output_nt_index_dir_base = output_tar_base[:-4]
        k = self.additional_attributes.get("k", 16)  # kmer k
        log.write(f"input: {nt_db} output: {output_nt_index_tar}")
        command.execute(
            command_patterns.SingleCommand(cmd="gmap_build",
                                           args=[
                                               "-D",
                                               output_nt_index_parent_dir,
                                               "-d", output_nt_index_dir_base,
                                               "-k", k, nt_db
                                           ]))

        output_nt_index_dir = os.path.join(output_nt_index_parent_dir,
                                           output_nt_index_dir_base)
        self.additional_output_folders_hidden.append(output_nt_index_dir)

        command.execute(
            command_patterns.SingleCommand(
                cd=output_nt_index_parent_dir,
                cmd="tar",
                args=["cvf", output_tar_base, output_nt_index_dir_base]))
Example #9
    def fetch_target_from_s3(self, target):
        ''' .done file should be written to the result dir when the download is complete '''
        log.write("Downloading target %s" % target)
        if target in self.given_targets:
            input_path_s3 = self.given_targets[target]["s3_dir"]
        else:
            input_path_s3 = self.output_dir_s3

        with log.log_context("fetch_input_files_from_s3", {"target": target}):
            PipelineFlow.fetch_input_files_from_s3(
                input_files=self.targets[target],
                input_dir_s3=input_path_s3,
                result_dir_local=self.output_dir_local)

        if target in self.given_targets and self.given_targets[target].get(
                "count_reads"):
            with log.log_context("count_input_reads", {"target": target}):
                try:
                    PipelineFlow.count_input_reads(
                        input_files=self.targets[target],
                        result_dir_local=self.output_dir_local,
                        result_dir_s3=self.output_dir_s3,
                        target_name=target,
                        max_fragments=self.given_targets[target]
                        ["max_fragments"])
                except AssertionError as e:
                    # The counting methods may raise assertion errors if assumptions
                    # about input format are not satisfied.
                    self.write_invalid_input_json({
                        "error": str(e),
                        "step": None
                    })
Example #10
 def multilinefa2singlelinefa(input_fasta, output_fasta):
     ''' Multi-line FASTA to Single-line FASTA conversion '''
     step = "Multi-line FASTA to single-line FASTA conversion"
     log.write(f"Starting {step}...")
     cmd = f"awk 'NR==1 {{print $0}} NR>1 && /^>/ {{printf(\"\\n%s\\n\",$0);next; }} NR>1 {{ printf(\"%s\",$0);}}  END {{printf(\"\\n\");}}' <{input_fasta} > {output_fasta}"
     command.execute(cmd)
     log.write(f"Finished {step}.")
Example #11
 def get_accession_metadata(accession):
     '''
     Retrieve metadata of an NCBI accession (e.g. name, country, collection date)
     TODO: Put this data in S3 instead and get it from there.
     '''
     accession_metadata = {}
     efetch_command = ";".join([
         f"QUERY={accession}",
         "BASE=https://eutils.ncbi.nlm.nih.gov/entrez/eutils",
         "SEARCH_URL=${BASE}/esearch.fcgi?db=nuccore\&term=${QUERY}\&usehistory=y",
         "OUTPUT=$(curl $SEARCH_URL)",
         "WEB=$(echo $OUTPUT | sed -e 's/.*<WebEnv>\(.*\)<\/WebEnv>.*/\\1/')",
         "KEY=$(echo $OUTPUT | sed -e 's/.*<QueryKey>\(.*\)<\/QueryKey>.*/\\1/')",
         "FETCH_URL=${BASE}/efetch.fcgi?db=nuccore\&query_key=${KEY}\&WebEnv=${WEB}\&rettype=gb\&retmode=xml",
         f"curl $FETCH_URL"
     ])
     genbank_xml = command.execute_with_output(efetch_command)
     root = ET.fromstring(genbank_xml).find('GBSeq')
     if not root:
         log.write(f"WARNING: {efetch_command} did not give a result")
         return accession_metadata
     accession_metadata['name'] = root.find('GBSeq_definition').text
     qualifiers_needed = {'country', 'collection_date'}
     for entry in root.find('GBSeq_feature-table')[0].find('GBFeature_quals'):
         if all(key in accession_metadata for key in qualifiers_needed):
             break
         for key in qualifiers_needed - accession_metadata.keys():
             if entry.find('GBQualifier_name').text == key:
                 accession_metadata[key] = entry.find('GBQualifier_value').text
     return accession_metadata
Example #12
    def subsample_fastas(input_fas, output_fas, max_fragments):
        ''' In memory subsampling '''
        paired = len(input_fas) >= 2
        # count lines
        cmd = "wc -l %s | cut -f1 -d ' '" % input_fas[0]
        total_records = int(command.execute_with_output(cmd)) // 2
        log.write("total reads: %d" % total_records)
        log.write("target reads: %d" % max_fragments)
        if total_records <= max_fragments:
            for infile, outfile in zip(input_fas, output_fas):
                command.execute("cp %s %s" % (infile, outfile))
            return

        # total_records > max_fragments, sample
        randgen = random.Random(x=hash(input_fas[0]))
        records_to_keep = randgen.sample(range(total_records), max_fragments)
        PipelineStepRunSubsample.subset(input_fas[0], output_fas[0],
                                        records_to_keep)
        if paired:
            PipelineStepRunSubsample.subset(input_fas[1], output_fas[1],
                                            records_to_keep)
            if len(input_fas) == 3 and len(output_fas) == 3:
                # subset the merged fasta
                records_to_keep_merged = []
                for r in records_to_keep:
                    records_to_keep_merged += [2 * r, 2 * r + 1]
                PipelineStepRunSubsample.subset(input_fas[2], output_fas[2],
                                                records_to_keep_merged)
Example #13
    def run(self):
        input_fas = self.input_files_local[0][0:2]
        output_fas = self.output_files_local()
        output_sam_file = os.path.join(
            self.output_dir_local,
            self.additional_attributes["output_sam_file"])
        self.additional_output_files_hidden.append(output_sam_file)

        genome_dir = fetch_reference(self.additional_files["gsnap_genome"],
                                     self.ref_dir_local,
                                     allow_s3mi=True,
                                     auto_untar=True)
        gsnap_base_dir = os.path.dirname(genome_dir)
        gsnap_index_name = os.path.basename(genome_dir)
        # Run Gsnap
        gsnap_params = [
            '-A', 'sam', '--batch=0', '--use-shared-memory=0',
            '--gmap-mode=all', '--npaths=1', '--ordered', '-t', 32,
            '--max-mismatches=40', '-D', gsnap_base_dir, '-d',
            gsnap_index_name, '-o', output_sam_file
        ] + input_fas
        command.execute(
            command_patterns.SingleCommand(cmd='gsnapl', args=gsnap_params))
        log.write("Finished GSNAP alignment.")

        # Extract out unmapped files from sam
        if len(input_fas) == 2:
            convert.generate_unmapped_pairs_from_sam(output_sam_file,
                                                     output_fas)
        else:
            convert.generate_unmapped_singles_from_sam(output_sam_file,
                                                       output_fas[0])
Example #14
    def run_locally(self, input_fas, output_m8):
        with tempfile.TemporaryDirectory(
                prefix=self.alignment_algorithm) as tmpdir:
            index_path = os.path.join(tmpdir, "reference")
            os.mkdir(index_path)
            run(["tar", "-xzvf", self.index, "-C", index_path], check=True)

            if self.alignment_algorithm == "gsnap":
                # Hack to determine gsnap vs gsnapl
                error_message = run([
                    'gsnapl.avx2-2018-10-26', '-D', index_path, '-d',
                    self.genome_name
                ],
                                    input='>'.encode('utf-8'),
                                    stderr=PIPE,
                                    stdout=PIPE).stderr
                # note, alignment uses a pinned version of gmap/gsnap
                gsnap_command = "gsnap.avx2-2018-10-26" if 'please run gsnap instead' in error_message.decode(
                    'utf-8') else "gsnapl.avx2-2018-10-26"
            else:
                gsnap_command = None

            # rapsearch2 expects the filename of the primary file, but still depends on the .info file being a sibling
            if self.alignment_algorithm == "rapsearch2":
                for filename in os.listdir(index_path):
                    if not filename.endswith(".info"):
                        index_path = os.path.join(index_path, filename)

            cmd = self._get_command(index_path,
                                    input_fas,
                                    output_m8,
                                    threads=multiprocessing.cpu_count(),
                                    gsnap_command=gsnap_command)
            log.write(f"running command {cmd}")
            run(cmd, check=True)
Example #15
 def fq2fa(input_fastq, output_fasta):
     ''' FASTQ to FASTA conversion '''
     step = "FASTQ to FASTA conversion"
     log.write(f"Starting {step}...")
     cmd = f"sed -n '1~4s/^@/>/p;2~4p' <{input_fastq} >{output_fasta}"
     command.execute(cmd)
     log.write(f"Finished {step}.")
Example #16
    def subsample_fastas(input_fas, output_fas, max_fragments):
        ''' In memory subsampling '''
        paired = len(input_fas) >= 2
        # count lines
        cmd_output = command.execute_with_output(
            command_patterns.SingleCommand(cmd="wc", args=["-l",
                                                           input_fas[0]]))
        lines_count = int(cmd_output.strip().split(' ')[0])
        total_records = lines_count // 2
        log.write("total reads: %d" % total_records)
        log.write("target reads: %d" % max_fragments)
        if total_records <= max_fragments:
            for infile, outfile in zip(input_fas, output_fas):
                command.copy_file(infile, outfile)
            return

        # total_records > max_fragments, sample
        randgen = random.Random(x=hash(input_fas[0]))
        records_to_keep = randgen.sample(range(total_records), max_fragments)
        PipelineStepRunSubsample.subset(input_fas[0], output_fas[0],
                                        records_to_keep)
        if paired:
            PipelineStepRunSubsample.subset(input_fas[1], output_fas[1],
                                            records_to_keep)
            if len(input_fas) == 3 and len(output_fas) == 3:
                # subset the merged fasta
                records_to_keep_merged = []
                for r in records_to_keep:
                    records_to_keep_merged += [2 * r, 2 * r + 1]
                PipelineStepRunSubsample.subset(input_fas[2], output_fas[2],
                                                records_to_keep_merged)
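The merged-FASTA index arithmetic above relies on the merged file interleaving the two mates of each pair, so fragment r occupies records 2*r and 2*r+1. A tiny worked example of that mapping (the sampled indices are made up):

records_to_keep = [0, 3, 7]          # fragment indices sampled from the paired files
records_to_keep_merged = []
for r in records_to_keep:
    records_to_keep_merged += [2 * r, 2 * r + 1]
assert records_to_keep_merged == [0, 1, 6, 7, 14, 15]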
Example #17
 def manage_reference_downloads_cache(self):
     command.make_dirs(PURGE_SENTINEL_DIR)
     command.touch(PURGE_SENTINEL)
     time.sleep(3)  # 1 should be enough for the sentinel to predate all current stage downloads
     present_set, missing_set = self.references_roll_call()
     total_set = present_set | missing_set
     if total_set:
         present = len(present_set)
         missing = len(missing_set)
         total = len(total_set)
         log.write(
             f"Reference download cache: {present} of {total} large files ({100 * present / total:3.1f} percent) already exist locally."
         )
         structured_report = {
             "summary_stats": {
                 "cache_requests": total,
                 "cache_hits": present,
                 "cache_misses": missing,
                 "cache_hit_rate": present / total
             },
             "per_request_stats": {
                 f: "hit" if f in present_set else "miss"
                 for f in sorted(total_set)
             }
         }
         log.log_event("Reference downloads cache efficiency report",
                       values=structured_report)
     idseq_dag.util.s3.make_space()  # make sure to touch this stage's files before deleting LRU ones
Example #18
    def get_wiki_content_for_page(taxid, pageid, taxname, taxid2wikicontent, mutex, semaphore, max_attempt=3):
        ''' Fetch wiki content for pageid '''
        for attempt in range(max_attempt):
            try:
                page = None
                if pageid:
                    log.write(f"fetching wiki {pageid} for {taxid}")
                    page = wikipedia.page(pageid=pageid)
                elif taxname:
                    search_results = wikipedia.search(taxname)
                    if len(search_results) > 0:
                        wikiname = str(search_results[0])
                        if taxname.lower() == wikiname.lower():
                            page = wikipedia.page(wikiname)
                    if not page:
                        # query the page directly
                        try:
                            page = wikipedia.page(taxname.replace(" ", "_"))
                        except:
                            page = None

                if page:
                    output = {
                        "pageid" : page.pageid,
                        "description": page.content[:1000],
                        "title": page.title,
                        "summary": page.summary
                    }
                    with mutex:
                        taxid2wikicontent[taxid] = output
                break
            except:
                log.write(f"having trouble fetching {taxid} wiki {pageid} attempt {attempt}")
        semaphore.release()
Example #19
def main():
    from idseq_dag.engine.pipeline_flow import PipelineFlow
    import idseq_dag.util.log as log
    log.configure_logger()

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--version',
        action='version',
        version='%(prog)s {version}'.format(version=__version__))
    parser.add_argument('dag_json', help='pipeline dag in json file format')
    parser.add_argument('--no-lazy-run', dest='lazy_run', action='store_false')
    parser.add_argument('--key-path-s3', dest='key_path_s3', help='ssh key')
    parser.add_argument('--no-versioned-output',
                        dest='versioned_output',
                        action='store_false')

    parser.set_defaults(lazy_run=True)
    parser.set_defaults(versioned_output=True)
    args = parser.parse_args()
    if args.key_path_s3:
        os.environ["KEY_PATH_S3"] = args.key_path_s3
    try:
        flow = PipelineFlow(lazy_run=args.lazy_run,
                            dag_json=args.dag_json,
                            versioned_output=args.versioned_output)
        log.write("everything is awesome. idseq dag is valid~")
    except:
        parser.print_help()
        raise
    log.write("start executing the dag")
    flow.start()
Example #20
        def process_hit(hit_type, hit_name):
            hit_data = read_data if hit_type == "read" else contig_data

            if hit_name not in hit_data:
                log.write(f"Could not find {hit_type} in map: {hit_name}")
                return

            hit_obj = hit_data[hit_name]

            # hitsummary is more strict than reassigned.
            # Sometimes reassigned will have a value for accession, but hitsummary won't.
            if hit_obj["accession"] != accession_id:
                log.write(f"Mismatched accession for {hit_name}: {hit_obj['accession']} (reassigned) versus {accession_id} (hitsummary)")
                return

            (accession_start, accession_end) = _align_interval(_decrement_lower_bound((hit_obj["subject_start"], hit_obj["subject_end"])))

            # If the hit is larger than the bin size, treat it as an individual hit.
            if accession_end - accession_start >= bin_size:
                if hit_type == "read":
                    individual_reads.append(hit_name)
                else:
                    individual_contigs.append(hit_name)

            # Otherwise, put the hit into a bin based on its midpoint
            else:
                hit_midpoint = (accession_end + accession_start) / 2
                hit_bin_index = _floor_with_min(hit_midpoint / bin_size, 0)

                if hit_type == "read":
                    read_bins[hit_bin_index].append(hit_name)
                else:
                    contig_bins[hit_bin_index].append(hit_name)
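A quick worked example of the binning rule, ignoring the interval-alignment helpers: with bin_size = 100, a hit covering positions 150-180 (span 30 < 100) is placed in the bin containing its midpoint 165, i.e. bin index 1, while a hit spanning 250-450 (span 200 >= 100) would be kept as an individual hit. The floor-with-minimum helper below is assumed to behave like the _floor_with_min used above:

import math

def floor_with_min(value, minimum):
    # assumed to mirror _floor_with_min: floor, clamped to a lower bound
    return max(math.floor(value), minimum)

bin_size = 100
accession_start, accession_end = 150, 180
assert accession_end - accession_start < bin_size       # small hit: bin it
hit_midpoint = (accession_end + accession_start) / 2     # 165.0
assert floor_with_min(hit_midpoint / bin_size, 0) == 1   # lands in bin 1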
Example #21
    def _calculate_rpms(rpm_df, amr_df, total_reads):
        """Matches each gene in the amr_results dataframe to it's associated number of reads in
            the matched reads table and calculates the reads per million. Both total reads and
            reads per million are appended in order in lists and then bulk appended back in
            _append_rpm_to_results() to the proc_amr_results dataframe."""
        rpm_list = []
        total_reads_list = []
        for row in amr_df.itertuples():
            if len(rpm_df[rpm_df["allele"] == row.allele]
                   ["reads"].values) == 0:
                # This should never happen, but it has happened before
                # because there were typos in allele names in ARGannot_r2.fasta that caused mismatches with argannot_genome.bed.
                # Log an error. The following line will crash the pipeline step, which is intended.
                # We prefer failing the pipeline step to showing incorrect or missing data while failing silently to the user.
                log.write(
                    f"[Datadog] AmrAlleleMismatchError: {row.allele} (from ARGannot_r2.fasta) could not be found in argannot_genome.bed"
                )

            reads_for_allele = rpm_df[rpm_df["allele"] ==
                                      row.allele]["reads"].values[0]
            total_reads_list.append(reads_for_allele)
            rpm_for_allele = reads_for_allele * 1000000 / total_reads
            rpm_list.append(rpm_for_allele)

        return [rpm_list, total_reads_list]
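The reads-per-million arithmetic in the loop is simply reads_for_allele * 1,000,000 / total_reads; for example, 150 reads mapped to an allele out of 2,000,000 total reads gives an RPM of 75. In code (values are made up):

reads_for_allele = 150
total_reads = 2_000_000
rpm_for_allele = reads_for_allele * 1000000 / total_reads
assert rpm_for_allele == 75.0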
Example #22
 def print_update(self, t_elapsed):
     """Log an update after every polling period to indicate the code section is
     still active.
     """
     log.write(
         "LongRunningCodeSection %d (%s) still running after %3.1f seconds."
         % (self.id, self.name, t_elapsed))
Example #23
    def thread_run(self):
        ''' Actually running the step '''
        self.status = StepStatus.STARTED
        self.update_status_json_file("instantiated")

        v = {"step": self.name}
        with log.log_context("dag_step", v):
            with log.log_context("substep_wait_for_input_files", v):
                self.wait_for_input_files()
            with log.log_context("substep_validate_input_files", v):
                self.validate_input_files()

            # If an input file error was detected, stop execution.
            if self.input_file_error:
                log.write("Invalid input detected for step %s" % self.name)
                self.status = StepStatus.INVALID_INPUT
                self.update_status_json_file("user_errored")
                return

            with log.log_context("substep_run", v):
                self.update_status_json_file("running")
                self.run()
            with log.log_context("substep_validate", v):
                self.validate()
            with log.log_context("substep_save_progress", v):
                self.save_progress()
            with log.log_context("substep_save_counts", v):
                self.save_counts()
        self.upload_thread = threading.Thread(target=self.uploading_results)
        self.upload_thread.start()
        self.status = StepStatus.FINISHED
        self.update_status_json_file("finished_running")
Example #24
 def wrapped_operation(*args, **kwargs):
     randgen = None
     remaining_attempts = MAX_TRIES
     delay = 1.0
     while remaining_attempts > 1:
         try:
             t_start = time.time()
             return operation(*args, **kwargs)
         except:
             t_end = time.time()
             if randgen is None:
                 invocation[0] += 1
                 randgen = random.Random(
                     os.getpid() * 10000 + invocation[0]
                 ).random  # seed based on pid so subprocesses won't retry in lockstep
             if t_end - t_start > 30:
                 # For longer operations, the delay should increase, so that the backoff will meaningfully reduce load on the failing service.
                 delay = (t_end - t_start) * 0.2
             wait_time = delay * (1.0 + 2.0 * randgen())
             log.write(
                 f"Sleeping {wait_time} seconds before retry {MAX_TRIES - remaining_attempts + 1} of {operation} with {args}, {kwargs}."
             )
             time.sleep(wait_time)
             delay *= 3.0
             remaining_attempts -= 1
     # The last attempt is outside try/catch so caller can handle exception
     return operation(*args, **kwargs)
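The pattern above is a generic retry wrapper: exponential backoff (the delay triples after each failure), random jitter so parallel processes do not retry in lockstep, and a final un-caught attempt so the caller sees the real exception. A stripped-down, self-contained sketch of the same pattern, not the project's actual decorator:

import random
import time

def retry(operation, max_tries=3, base_delay=1.0):
    """Retry operation with jittered exponential backoff; the last attempt may raise."""
    delay = base_delay
    for _ in range(max_tries - 1):
        try:
            return operation()
        except Exception:
            wait_time = delay * (1.0 + 2.0 * random.random())  # jitter
            time.sleep(wait_time)
            delay *= 3.0  # exponential backoff
    return operation()  # final attempt propagates any exception to the caller

# Usage: the flaky operation succeeds on its third call.
calls = {"n": 0}
def flaky():
    calls["n"] += 1
    if calls["n"] < 3:
        raise RuntimeError("transient failure")
    return "ok"

assert retry(flaky, max_tries=3, base_delay=0.01) == "ok"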
Example #25
    def test_write_info_obj_data(self, mock_getLogger):
        '''Test writing info object data to the logs'''

        log.write(obj_data={"name_a": 1, "name_b": "fake_string", "name_c": 3.1415})

        mock_getLogger.assert_has_calls([
            call().info(msg=None, extra={"obj_data": {"name_a": 1, "name_b": "fake_string", "name_c": 3.1415}})
        ])
Example #26
def really_make_space():
    refdir = config['REF_DIR']
    while need_more_space(refdir):
        lru = find_oldest_reference(refdir)
        if not lru:
            log.write("WARNING:  Too little available space on instance, and could not find any reference downloads to delete.")
            break
        command.remove_rf(lru)
Example #27
    def test_write_debug_text(self, mock_getLogger):
        '''Test writing a debug text message to the logs'''

        log.write("Text message", debug=True)

        mock_getLogger.assert_has_calls([
            call().debug(msg='Text message', extra={"obj_data": None})
        ])
Example #28
def execute(
    command: Union[command_patterns.CommandPattern, str],
    progress_file: str = None,
    timeout: int = None,
    grace_period: int = None,
    capture_stdout: bool = False,
    merge_stderr: bool = False,
    log_context_mode: log.LogContextMode = log.LogContextMode.START_END_LOG_EVENTS
) -> Union[str, None]:
    """Primary way to start external commands in subprocesses and handle
    execution with logging.
    """
    if not isinstance(command, command_patterns.CommandPattern):
        # log warning if using legacy format
        log.write(
            warning=True,
            message="Command parameter is using legacy type str. Use idseq_dag.util.command_patterns.",
            obj_data={
                "cmd": command,
                "type": type(command)
            })
        cmd = command_patterns.ShellScriptCommand(script=command, args=[])
    else:
        cmd = command

    with CommandTracker() as ct:
        log_values = {"cid": f"Command {ct.id}", "command": cmd.as_dict()}
        with log.log_context('command_execute',
                             values=log_values,
                             log_context_mode=log_context_mode) as lctx:
            with ProgressFile(progress_file):
                if timeout:
                    ct.timeout = timeout
                if grace_period:
                    ct.grace_period = grace_period
                if capture_stdout:
                    # Capture only stdout. Child stderr = parent stderr unless
                    # merge_stderr specified. Child input = parent stdin.
                    ct.proc = cmd.open(stdin=sys.stdin.fileno(),
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.STDOUT if merge_stderr
                                       else sys.stderr.fileno())
                    stdout, _ = ct.proc.communicate()
                else:
                    # Capture nothing. Child inherits parent stdin/out/err.
                    ct.proc = cmd.open()
                    ct.proc.wait()
                    stdout = None

                lctx.values.update({"returncode": ct.proc.returncode})

                if ct.proc.returncode:
                    raise subprocess.CalledProcessError(
                        ct.proc.returncode, str(command), stdout)
                if capture_stdout:
                    return stdout
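A hedged usage sketch of the preferred (non-legacy) calling convention: pass a command_patterns object instead of a shell string and capture stdout. The idseq_dag.util.command_patterns path is the one named in the warning above; the idseq_dag.util.command module path is an assumption based on how `command.execute` is used elsewhere in these examples.

import idseq_dag.util.command as command                    # assumed module path
import idseq_dag.util.command_patterns as command_patterns  # path named in the warning above
import idseq_dag.util.log as log

log.configure_logger()
# Structured command (no legacy shell string); stdout is captured and returned as bytes.
listing = command.execute(
    command_patterns.SingleCommand(cmd="ls", args=["-l", "/tmp"]),
    capture_stdout=True)
log.write(f"captured {len(listing)} bytes of output")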
Example #29
    def test_write_info_non_serializable(self, mock_getLogger):
        '''Test writing a log entry with a non-serializable field'''
        non_serializable_byte_field = bytes()

        log.write(obj_data={"name_a": 1, "name_b": non_serializable_byte_field, "name_c": 3.1415})

        mock_getLogger.assert_has_calls([
            call().info(msg=None, extra={"obj_data": {"name_a": 1, "name_b": non_serializable_byte_field, "name_c": 3.1415}})
        ])
Example #30
    def write_invalid_input_json(self, error_json):
        ''' Upload an invalid_step_input.json file for this step, which can be detected by other services like idseq-web. '''
        log.write("Writing invalid_step_input.json")
        local_input_errors_file = f"{self.output_dir_local}/invalid_step_input.json"

        with open(local_input_errors_file, 'w') as input_errors_file:
            json.dump(error_json, input_errors_file)

        idseq_dag.util.s3.upload_with_retries(local_input_errors_file, self.output_dir_s3 + "/")