def get_amplicon_json_file_path(scheme, scheme_version, scheme_dir):
    """
    Get the file path to the amplicon primer JSON file
    Parameters
    ----------
    scheme: Path
        Artic primer scheme to get data from
    scheme_version: str
        The version, if any
    scheme_dir: str
        The scheme directory
    Returns
    -------
    pathlib.PosixPath
        File path to the JSON primer file.
    """
    artic_results_dir_primers = (
        Path(get_env_variable("MT_ARTIC_RESULTS_DIR")) / "artic/" / "primers_files"
    )
    if not (Path(get_env_variable("MT_ARTIC_RESULTS_DIR")) / "artic/").exists():
        (Path(get_env_variable("MT_ARTIC_RESULTS_DIR")) / "artic/").mkdir()
    # Needs to be dynamic here
    bed_file = f"{scheme}.scheme.bed"
    json_file = f"{scheme}_{scheme_version}.primers.json"
    full_path_to_bed_file = scheme_dir / scheme / scheme_version / bed_file
    if not (artic_results_dir_primers / json_file).exists():
        json_file_path = convert_amplicon_bed_file_to_json(
            full_path_to_bed_file, json_file, artic_results_dir_primers
        )
    else:
        json_file_path = artic_results_dir_primers / json_file
    return json_file_path
def create_minimap2_index(ref_info, file_name):
    """
    Create the minimap2 index for the reference file that we are uploading
    Parameters
    ----------
    ref_info: reference.models.ReferenceInfo
        The django ORM object
    file_name: pathlib.PosixPath
        The filename we are uploading
    Returns
    -------
    str
        File path to the newly created minimap2 index file
    """
    index_dir_path = (
        MEDIA_ROOT
        if get_env_variable("MT_MINIMAP2_INDEX_DIR").isdigit()
        else get_env_variable("MT_MINIMAP2_INDEX_DIR")
    )
    minimap2_index_file_location = (
        f"{index_dir_path}/minimap2_indexes/{file_name.stem}.mmi"
    )
    # Capture stdout/stderr so the prints below show minimap2's output rather than None
    out, err = subprocess.Popen(
        f"{MINIMAP2} -d {minimap2_index_file_location}"
        f" {ref_info.file_location.path}".split(),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    ).communicate()
    print(out)
    print(err)
    return minimap2_index_file_location
def predict_barcode_will_finish(
    amplicon_median_array, num_barcodes, total_mapped_reads_count
):
    """
    Predict whether this barcode will reach sufficient coverage if sequencing continues
    Parameters
    ----------
    amplicon_median_array: np.ndarray
        Median coverage for each amplicon
    num_barcodes: int
        The total number of barcodes detected in the run
    total_mapped_reads_count: int
        Total number of reads that have mapped to n-cov in this task
    Returns
    -------
    bool
        If this equation thinks the barcode will finish given enough time sequencing
    """
    # 100000 reads per barcode in a run
    ideal_reads_count_constant = int(get_env_variable("MT_IDEAL_READ_CONSTANT"))
    minimum_required_amplicons = int(get_env_variable("MT_ARTIC_MIN_AMPS_PERC"))
    coverage_per_amplicon = int(get_env_variable("MT_COVERAGE_PER_AMPLICON"))
    predicted_coverages = (
        amplicon_median_array
        / total_mapped_reads_count
        * num_barcodes
        * ideal_reads_count_constant
    )
    return (
        predicted_coverages[predicted_coverages > coverage_per_amplicon].size
        / amplicon_median_array.size
    ) * 100 > minimum_required_amplicons
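# A quick worked example of the extrapolation above. The constants stand in for the
# MT_IDEAL_READ_CONSTANT, MT_ARTIC_MIN_AMPS_PERC and MT_COVERAGE_PER_AMPLICON environment
# variables and are illustrative values, not the project's defaults.
import numpy as np

ideal_reads_count_constant = 100_000  # assumed MT_IDEAL_READ_CONSTANT
minimum_required_amplicons = 90       # assumed MT_ARTIC_MIN_AMPS_PERC (percent of amplicons)
coverage_per_amplicon = 20            # assumed MT_COVERAGE_PER_AMPLICON (x coverage)

# Median coverage seen so far on four amplicons, from 5,000 mapped reads across 12 barcodes
amplicon_median_array = np.array([3.0, 5.0, 0.5, 4.0])
total_mapped_reads_count = 5_000
num_barcodes = 12

# Scale the per-amplicon coverage up to the ideal read count for the run
predicted_coverages = (
    amplicon_median_array / total_mapped_reads_count * num_barcodes * ideal_reads_count_constant
)  # -> [720., 1200., 120., 960.]
percent_over_threshold = (
    predicted_coverages[predicted_coverages > coverage_per_amplicon].size
    / amplicon_median_array.size
) * 100  # -> 100.0, which exceeds 90, so this barcode is predicted to finish
print(percent_over_threshold > minimum_required_amplicons)  # True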
def secure_artic_runs():
    """
    When run monitor is celery beaten, go through flowcells with a run_artic_pipeline command and check
    they have had data uploaded to them within the configured clearing window (MT_ARTIC_TIME_UNTIL_CLEARING hours).
    If they haven't, trigger any barcodes that haven't been run, to tidy up all the sensitive files.
    Returns
    -------

    """
    logger.info(
        "Starting securing artic tasks for flowcells that haven't uploaded in 12 hours"
    )
    jobs = JobMaster.objects.filter(job_type_id=16, complete=False)
    for artic_job in jobs:
        flowcell = artic_job.flowcell
        last_activity_date = flowcell.last_activity_date
        twelve_hours = datetime.timedelta(
            hours=int(get_env_variable("MT_ARTIC_TIME_UNTIL_CLEARING"))
        )
        three_hours = datetime.timedelta(hours=3)
        active = (
            last_activity_date
            > datetime.datetime.now(datetime.timezone.utc) - twelve_hours
        )
        trigger_barcodes = (
            last_activity_date
            < datetime.datetime.now(datetime.timezone.utc) - three_hours
        )
        if trigger_barcodes:
            # so much unnecessary computing - need a way to mark it as final outside ArticBarcodeMetadata
            trigger_all_barcodes_after_run(artic_job)
        if int(get_env_variable("MT_DESTROY_ARTIC_EVIDENCE")):
            # Not super happy with this, as it is affected by other things than read upload,
            # but can't think of an easy work around. If we aren't storing reads, we don't really
            # update it apart from when we upload a read batch
            # TODO ideally we would add a run last activity time for this
            last_activity_date = flowcell.last_activity_date
            active = (
                last_activity_date
                > datetime.datetime.now(datetime.timezone.utc) - twelve_hours
            )
            if not active:
                results_dir = make_results_directory_artic(flowcell.id, artic_job.id)
                for barcode_name in ArticBarcodeMetadata.objects.filter(
                    job_master=artic_job
                ).values_list("barcode__name", flat=True):
                    clear_unused_artic_files(
                        str(results_dir / barcode_name), barcode_name, flowcell.id
                    )
                artic_job.complete = True
                artic_job.save()
    logger.info("Finished securing artic tasks")
def get_results_modal_html(request, pk):
    """
    Return the html for the modal for the all results download functionality
    Parameters
    ----------
    request: rest_framework.request.Request
        The ajax request body
    pk: int
        The primary key of the flowcell object in the database
    Returns
    -------
    django.http.HttpResponse
        The rendered modal html
    """
    results_files_extra = [
        ("Input fasta", "input-fasta"),
        ("Sorted Bam", "sorted-bam"),
        ("Sorted Bam Index", "sorted-bam-bai"),
    ]
    results_files = [
        ("Consensus sequence", "consensus"),
        ("Box plot", "box-plot"),
        ("Bar plot", "bar-plot"),
        ("Fail VCF", "fail-vcf"),
        ("Pass VCF", "pass-vcf"),
        ("Pangolin lineages CSV", "pangolin-lineages"),
    ]
    if not int(get_env_variable("MT_DESTROY_ARTIC_EVIDENCE")):
        results_files.extend(results_files_extra)
    context_dict = {"hidden_results_files": results_files}
    return render(
        request, "all-results-modal.html", context={"context": context_dict}
    )
def make_results_directory_artic(flowcell_id, task_id, allow_create=True):
    """
    Make a results directory
    Parameters
    ----------
    flowcell_id: int
        Primary key of the flowcell entry in the database
    task_id: int
        Primary key of the task record in the database.
    allow_create: bool
        Allow the creation of the directory if it doesn't already exist
    Returns
    -------
    results_dir: pathlib.PosixPath
        PosixPath pointing to the results directory
    """
    environmental_results_directory = get_env_variable("MT_ARTIC_RESULTS_DIR")
    artic_dir = Path(f"{environmental_results_directory}/artic/")
    if not artic_dir.exists() and allow_create:
        Path.mkdir(artic_dir)
    results_dir = Path(f"{environmental_results_directory}/artic/Temp_results")
    if not results_dir.exists() and allow_create:
        Path.mkdir(results_dir)
    results_dir = Path(
        f"{environmental_results_directory}/artic/Temp_results/{flowcell_id}_{task_id}_artic"
    )
    if not results_dir.exists() and allow_create:
        Path.mkdir(results_dir)
    return results_dir
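# Usage sketch: with MT_ARTIC_RESULTS_DIR=/data/minotour (an illustrative value, not the project's
# default), a flowcell with primary key 12 and a task with primary key 34 resolve to the path below.
results_dir = make_results_directory_artic(12, 34)
# results_dir -> PosixPath("/data/minotour/artic/Temp_results/12_34_artic")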
def active(self):
    """
    Determine whether this flowcell has been active within the configured inactivity window
    (MT_TIME_UNTIL_INACTIVE days, 48 hours in the default configuration)
    :return:
    """
    # time deltas are pythons measurement of time difference
    delta = datetime.timedelta(days=int(get_env_variable("MT_TIME_UNTIL_INACTIVE")))
    # If the current time minus the window is later than the last activity date, there has been no activity
    if (datetime.datetime.now(datetime.timezone.utc) - delta) > self.last_activity_date:
        return False
    # Activity within the window
    return True
def create_archive_tasks():
    """
    Create archive tasks for flowcells that are more than X days since last use, don't archive if set to -1
    Returns
    -------

    """
    # Env variables come back as strings, so cast before comparing; 0 or -1 disables archiving
    time_until_inactive = int(get_env_variable("MT_TIME_UNTIL_ARCHIVE"))
    if time_until_inactive <= 0:
        return
    delta = datetime.timedelta(days=time_until_inactive)
    for flowcell in Flowcell.objects.filter(archived=False):
        if flowcell.last_activity_date < datetime.datetime.now(datetime.timezone.utc) - delta:
            jm, created = JobMaster.objects.get_or_create(
                job_type_id=18, flowcell=flowcell
            )
            if created:
                logger.info(f"Marking flowcell: {flowcell} for archiving.")
def clear_artic_data(job_master):
    """
    Clear the artic files from the system drive
    Parameters
    ----------
    job_master: reads.models.JobMaster
        The job master ORM object of the track artic job
    Returns
    -------
    exit_code: int
        0 if successful, 1 if not
    """
    environmental_results_directory = get_env_variable("MT_ARTIC_RESULTS_DIR")
    results_dir = Path(
        f"{environmental_results_directory}/artic/Temp_results/{job_master.flowcell.id}_{job_master.id}_artic"
    )
    if not results_dir.exists():
        return 1
    else:
        # remove this task's results directory and everything in it
        rmtree(results_dir, onerror=on_delete_error)
        return 0
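# on_delete_error is defined elsewhere in the project. A minimal sketch of what such an rmtree
# error handler typically does (an assumption, not the project's actual implementation) is to
# make the offending path writable and retry the failed operation once.
import os
import stat


def on_delete_error(func, path, exc_info):
    """Hypothetical rmtree onerror handler: fix permissions and retry once."""
    os.chmod(path, stat.S_IWRITE)
    func(path)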
def ready(self):
    """
    Hook that is called when the Artic app is initialised and ready.
    The code below runs once the app registry has loaded.
    Returns
    -------

    """
    from artic.utils import check_artic_static_exists, update_pangolin

    check_artic_static_exists()
    from minotourapp.utils import get_env_variable

    MT_VoC_PATH = get_env_variable("MT_VoC_PATH")
    if Path(f"{MT_VoC_PATH}").exists():
        print("VoC Path Found")
        # Check whether the variant definitions repository has already been cloned
        # cloned_repo = Repo.clone(os.path.join("https://github.com/phe-genomics/variant_definitions", Path(f"{MT_VoC_PATH}")))
        if Path(f"{MT_VoC_PATH}/variant_definitions/").exists():
            # already cloned so....
            print("Updating path")
            try:
                repo = Repo(Path(f"{MT_VoC_PATH}/variant_definitions/"))
                print(repo.remotes.origin.pull())
            except git.GitCommandError as e:
                print(
                    f"Git error, presumably being updated simultaneously {repr(e)}"
                )
        else:
            try:
                cloned_repo = Repo.clone_from(
                    "https://github.com/phe-genomics/variant_definitions",
                    f"{MT_VoC_PATH}/variant_definitions/",
                )
            except git.GitCommandError as e:
                print(
                    f"Git error, presumably being updated simultaneously {repr(e)}"
                )
    update_pangolin()
""" Create single redis instance and import it around where it is needed """ import redis from minotourapp.utils import get_env_variable redis_instance = redis.StrictRedis( host="127.0.0.1", port=6379, db=1, decode_responses=True) if "localhost" in get_env_variable( "MT_DJANGO_REDIS_URL") else redis.StrictRedis( unix_socket_path=get_env_variable("MT_DJANGO_REDIS_URL").split( "//")[-1], decode_responses=True)
def get_artic_barcode_metadata_html(request):
    """
    Return the rendered metadata html for the selected artic barcode
    Parameters
    ----------
    request: rest_framework.request.Request
        Request body, params: the flowcell PK and selected barcode
    Returns
    -------

    """
    flowcell_id = request.GET.get("flowcellId", None)
    selected_barcode = request.GET.get("selectedBarcode", None)
    if not flowcell_id or not selected_barcode:
        return Response(
            "No flowcell ID or barcode provided.", status=status.HTTP_400_BAD_REQUEST
        )
    # see if we have a command waiting to be run
    try:
        artic_command_jm = bool(
            JobMaster.objects.filter(
                job_type_id=17, barcode__name=selected_barcode, flowcell_id=flowcell_id
            )
        )
    except JobMaster.DoesNotExist:
        artic_command_jm = False
    orm_object = ArticBarcodeMetadata.objects.filter(
        flowcell_id=flowcell_id, barcode__name=selected_barcode
    ).last()
    if not orm_object:
        return Response("No data found", status=status.HTTP_404_NOT_FOUND)
    # On the first iteration we may not have FlowcellSummaryBarcode records yet, so calculate the percentage here
    if not orm_object.percentage_of_reads_in_barcode:
        try:
            barcode_numbers = FlowcellSummaryBarcode.objects.get(
                flowcell_id=flowcell_id, barcode_name=selected_barcode
            )
            all_numbers = FlowcellSummaryBarcode.objects.filter(
                flowcell_id=flowcell_id, barcode_name="All reads"
            ).values_list("read_count", flat=True)
            total_reads = 0
            for all_number in all_numbers:
                total_reads += all_number
            orm_object.percentage_of_reads_in_barcode = (
                barcode_numbers.read_count / total_reads * 100
            )
        except FlowcellSummaryBarcode.DoesNotExist as e:
            orm_object.percentage_of_reads_in_barcode = 0
    # [[new key, old key]]
    new_key_names = [
        ["Avg. Coverage", "average_coverage"],
        ["Var. Coverage", "variance_coverage"],
        ["Min. Coverage", "minimum_coverage"],
        ["Max. Coverage", "maximum_coverage"],
        ["% reads in run", "percentage_of_reads_in_barcode"],
        ["Has Finished", "has_finished"],
        ["Has Sufficient Coverage", "has_sufficient_coverage"],
    ]
    results_files = [
        ("Consensus sequence", "consensus"),
        ("Box plot", "box-plot"),
        ("Bar plot", "bar-plot"),
        ("Fail VCF", "fail-vcf"),
        ("Pass VCF", "pass-vcf"),
        ("Input fasta", "input-fasta"),
        ("Pangolin lineages CSV", "pangolin-lineages"),
        ("Sorted Bam", "sorted-bam"),
        ("Sorted Bam index", "sorted-bam-bai"),
    ]
    old_dict = orm_object.__dict__
    context_dict = {key[0]: old_dict[key[1]] for key in new_key_names}
    context_dict["hidden_barcode_pk"] = orm_object.barcode.id
    context_dict["hidden_barcode_name"] = orm_object.barcode.name
    context_dict["hidden_flowcell_id"] = flowcell_id
    context_dict["hidden_job_master_id"] = orm_object.job_master.id
    context_dict["hidden_results_files"] = results_files
    context_dict["hidden_has_finished"] = old_dict["has_finished"]
    context_dict["hidden_has_suff"] = old_dict["has_sufficient_coverage"]
    context_dict["hidden_marked_for_rerun"] = old_dict["marked_for_rerun"]
    context_dict["hidden_destroy_evidence"] = bool(
        int(get_env_variable("MT_DESTROY_ARTIC_EVIDENCE"))
    )
    context_dict["hidden_triggered_by_cleanup"] = (
        orm_object.has_finished and not orm_object.has_sufficient_coverage
    )
    context_dict["hidden_has_command_job_master"] = artic_command_jm
    (
        flowcell,
        artic_results_path,
        artic_task_id,
        _,
    ) = quick_get_artic_results_directory(flowcell_id)
    fastq_path = artic_results_path / selected_barcode / f"{selected_barcode}.fastq"
    fastq_path_gz = fastq_path.with_suffix(".fastq.gz")
    context_dict["hidden_has_fastq"] = fastq_path.exists() or fastq_path_gz.exists()
    if context_dict["hidden_has_finished"]:
        csv_path = artic_results_path / selected_barcode / "lineage_report.csv.gz"
        if csv_path.exists():
            df = pd.read_csv(
                artic_results_path / selected_barcode / "lineage_report.csv.gz"
            )
            html_string = df.T.to_html(classes="table table-striped", border=0)
            context_dict["hidden_html_string"] = html_string
    return render(
        request,
        "artic-barcode-metadata.html",
        context={"artic_barcode_metadata": context_dict},
    )
import os
import tempfile

from celery.schedules import crontab
from kombu import Exchange, Queue

# Build paths inside the project like this: os.path.join(BASE_DIR, ...)
from minotourapp.utils import get_env_variable

BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/1.11/howto/deployment/checklist/

# SECURITY WARNING: keep the secret key used in production secret!
SECRET_KEY = get_env_variable("MT_SECRET_KEY")

# DEBUG = bool(os.environ.get('MT_DJANGO_DEBUG', True))

ALLOWED_HOSTS = [
    '*',
]

# Application definition

INSTALLED_APPS = [
    'django.contrib.admin',
    'django.contrib.auth',
    'django.contrib.contenttypes',
    'django.contrib.sessions',
    'django.contrib.messages',
    'django.contrib.staticfiles',
def run_centrifuge(flowcell_job_id, streamed_reads=None):
    """
    Run the metagenomics subprocess command, returning the data from it as a DataFrame.
    Parameters
    ----------
    flowcell_job_id: int
        The primary key of the flowcell ID
    streamed_reads: list of dict
        A list of dictionaries containing read information
    Returns
    -------
    pd.core.frame.DataFrame, int, int, int, pandas.core.frame.DataFrame, int, int
        Dataframe of metagenomics results, total output lines from metagenomics, last read primary key,
        total count of reads analysed, dataframe of any reads that identified as targets,
        number of reads with classifications, number of reads without classifications
    """
    # The JobMaster object
    task = JobMaster.objects.get(pk=flowcell_job_id)
    # The flowcell the reads are from
    flowcell = task.flowcell
    avg_read_length = int(flowcell.average_read_length)
    if avg_read_length == 0:
        logger.error(
            "Average read length is zero. Defaulting to 1000, but this is an error."
        )
        avg_read_length = 1000
    if not streamed_reads and not isinstance(streamed_reads, list):
        read_count, last_read, fasta_df_barcode = get_fastq_df(
            flowcell_pk=int(flowcell.id),
            desired_yield=50,
            avg_read_len=avg_read_length,
            task=task,
        )
    else:
        last_read = task.last_read
        fasta_df_barcode = pd.DataFrame(streamed_reads)
        if not fasta_df_barcode.empty:
            fasta_df_barcode = fasta_df_barcode.rename(
                columns={"type": "read_type_id", "barcode": "barcode_id"}
            )
            fasta_df_barcode["type__name"] = fasta_df_barcode["read_type_id"]
        read_count = fasta_df_barcode.shape[0]
    if fasta_df_barcode.empty:
        return pd.DataFrame(), None, None, None, None, 0, 0
    logger.debug(
        "Flowcell id: {} - number of reads found {}".format(flowcell.id, read_count)
    )
    # Create a fasta string to pass to Centrifuge
    fasta_df_barcode["fasta"] = (
        ">read_id=" + fasta_df_barcode["read_id"]
        + ",barcode=" + fasta_df_barcode["barcode_name"]
        + "\n" + fasta_df_barcode["sequence"]
    )
    fastqs_data = "\n".join(list(fasta_df_barcode["fasta"]))
    logger.info("Flowcell id: {} - Loading index and Centrifuging".format(flowcell.id))
    # Write the generated fasta data to stdin, passing it to the command
    # Use Popen to run the metagenomics command
    # The path to the metagenomics executable
    centrifuge_path = get_env_variable("MT_CENTRIFUGE")
    # The path to the Centrifuge Index
    index_path = get_env_variable("MT_CENTRIFUGE_INDEX")
    # The command to run metagenomics
    cmd = "perl " + centrifuge_path + " -f --mm -k 3 -x " + index_path + " -"
    try:
        out, err = subprocess.Popen(
            cmd.split(),
            preexec_fn=lambda: os.nice(-10),
            stdout=subprocess.PIPE,
            stdin=subprocess.PIPE,
            stderr=subprocess.PIPE,
        ).communicate(input=str.encode(fastqs_data))
    except subprocess.SubprocessError as e:
        logger.warning(f"{e}, running with standard niceness index.")
        out, err = subprocess.Popen(
            cmd.split(),
            stdout=subprocess.PIPE,
            stdin=subprocess.PIPE,
            stderr=subprocess.PIPE,
        ).communicate(input=str.encode(fastqs_data))
    # out is a bytestring so it needs decoding
    if not out:
        logger.info(
            "Flowcell id: {} - No reads found or no metagenomics output."
            " Check above for error".format(flowcell.id)
        )
        task.running = False
        task.save()
        return None
    centrifuge_output = out.decode()
    # total number of lines of metagenomics output dealt with
    total_centrifuge_output = centrifuge_output.count("\n") - 1
    logger.info(
        "Flowcell id: {} - number of metagenomics output lines is {}".format(
            flowcell.id, total_centrifuge_output
        )
    )
    # output fields is the column headers for the pandas data frame
    output_fields = ["readID", "seqID", "taxID", "numMatches"]
    # create the DataFrame from the output
    df = pd.read_csv(StringIO(centrifuge_output), sep="\t", usecols=output_fields)
    # split out the barcode_name from the readID string
    df = split_read_id_and_barcodes(df)
    individual_reads_classified = np.unique(df["readID"].values).size
    targets_df = separate_target_cent_output(df, task, fasta_df_barcode)
    # The number of reads we have any form of classification for
    reads_classified = np.unique(df[df["tax_id"].ne(0)]["read_id"].values).size
    # The number of reads we have completely failed to classify
    reads_unclassified = np.unique(df[df["tax_id"].eq(0)]["read_id"].values).size
    # Get the metadata object. Contains the start time, end time and runtime of the task
    metadata, created = Metadata.objects.get_or_create(task=task)
    return (
        df,
        individual_reads_classified,
        read_count,
        last_read,
        targets_df,
        reads_classified,
        reads_unclassified,
    )
def save_reads_bulk(reads):
    """
    Save reads into redis after they arrive from minFQ, and to the database for tasks to be run on them later.
    Parameters
    ----------
    reads: list of dict
        A list of reads in dictionary form sent from minFQ
    Returns
    -------
    None
    """
    flowcell_dict = {}
    reads_list = []
    run_dict = {}
    for read in reads:
        run_pk = read.get("run", -1)
        if run_pk not in run_dict and run_pk != -1:
            run = Run.objects.get(pk=run_pk)
            run_dict[run_pk] = run
            read["run_id"] = run.id
            read["flowcell_id"] = run.flowcell.id
        else:
            read["run_id"] = run_dict[run_pk].id
            read["flowcell_id"] = run_dict[run_pk].flowcell.id
        if read["flowcell_id"] not in flowcell_dict:
            f = Flowcell.objects.get(pk=read["flowcell_id"])
            if f.archived:
                f.archived = False
            f.last_activity_date = datetime.now(timezone.utc)
            f.save()
            flowcell_dict[read["flowcell_id"]] = 1
        fastq_read = FastqRead(
            read_id=read["read_id"],
            read=read["read"],
            channel=read["channel"],
            barcode_id=read["barcode"],
            rejected_barcode_id=read["rejected_barcode"],
            barcode_name=read["barcode_name"],
            sequence_length=read["sequence_length"],
            quality_average=read["quality_average"],
            sequence=read["sequence"],
            quality=read["quality"],
            is_pass=read["is_pass"],
            start_time=read["start_time"],
            run_id=read["run_id"],
            flowcell_id=read["flowcell_id"],
            type_id=read["type"],
            fastqfile_id=read["fastq_file"],
        )
        reads_list.append(fastq_read)
    # Save reads to redis for later processing of base-called data summaries.
    reads_as_json = json.dumps(reads)
    # Pause until the backlog of read batches in redis drops below 40 before adding more
    count = redis_instance.scard("reads")
    while count > 40:
        time.sleep(5)
        count = redis_instance.scard("reads")
    redis_instance.sadd("reads", reads_as_json)
    # Bulk create the entries
    skip_sequence_saving = int(get_env_variable("MT_SKIP_SAVING_SEQUENCE"))
    if not skip_sequence_saving:
        FastqRead.objects.bulk_create(reads_list, batch_size=1000)
def map_target_reads(task, path_to_reference, target_df, to_save_df, target_region_df):
    """
    Map the target reads against the target set references
    Parameters
    ----------
    task: reads.models.JobMaster
        The django ORM object of this task
    path_to_reference: pathlib.PosixPath
        The path to the concatenated, Gzipped reference file for all references in target set
    target_df: pd.core.frame.DataFrame
        Target reads dataframe, containing read sequence
    to_save_df: pd.core.frame.DataFrame
        The finalised metagenomics output data, with num_matches, lineages etc.
    target_region_df: pd.core.frame.DataFrame
        The regions defined in the GFF file that contain virulence areas
    Returns
    -------
    pd.core.frame.DataFrame
        Dataframe of minimap2 output from mapped target reads and num_mapped
    """
    minimap2_executable_path = get_env_variable("MT_MINIMAP2")
    cmd = f"{minimap2_executable_path} -x map-ont {path_to_reference} -"
    # target_df = pd.merge(target_df, to_save_df, on="tax_id")
    target_df["unique"] = np.where(target_df["numMatches"] == 1, 1, 0)
    gb = target_df.groupby(["tax_id", "barcode_name"])
    target_df.set_index(["tax_id", "barcode_name"], inplace=True)
    target_df["num_matches"] = gb.size()
    target_df["sum_unique"] = gb["unique"].sum()
    target_df.reset_index(inplace=True)
    taxid_list = np.unique(target_df["tax_id"].values)
    # use the NCBITaxa lookup to get the species names for the tax ids in the tax id list
    ncbi = NCBITaxa()
    taxid_2_name = ncbi.get_taxid_translator(taxid_list)
    target_df["name"] = target_df["tax_id"].map(taxid_2_name)
    fasta_sequence_to_map = "\n".join(
        (">" + target_df["read_id"] + "\n" + target_df["sequence"]).values.tolist()
    )
    start_metagenomics_mapping_task(task.flowcell.id, task.target_set, target_df)
    # TODO merge the mapping task into this function rather than the separate mapping task creation above
    process = subprocess.Popen(
        cmd.split(),
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    out, err = process.communicate(input=fasta_sequence_to_map.encode())
    if not out:
        logger.info("No reads mapped.")
        return pd.DataFrame()
    target_set_plasmids_ref_pks = MappingTarget.objects.filter(
        target_set=task.target_set
    ).values_list("reference_id", flat=True)
    reference_contig_names = (
        ReferenceInfo.objects.filter(pk__in=target_set_plasmids_ref_pks)
        .values_list("reference_lines__line_name", "name")
        .distinct()
    )
    contig_to_reference_dict = {
        contig_name: ref_species for contig_name, ref_species in reference_contig_names
    }
    map_out_df = pd.read_csv(StringIO(out.decode()), sep="\t", header=None)
    map_out_df.rename(
        columns={
            0: "read_id",
            1: "query_seq_len",
            2: "query_start",
            3: "query_end",
            4: "rel_strand",
            5: "target_seq_name",
            6: "target_seq_length",
            7: "target_start",
            8: "target_end",
            9: "num_residue_matches",
            10: "alignment_block_length",
            11: "mapping_qual",
        },
        inplace=True,
    )
    map_out_df["name"] = map_out_df["target_seq_name"].map(contig_to_reference_dict)
    map_out_df["num_residue_matches"] = map_out_df["num_residue_matches"].astype(
        np.int64
    )
    # Filter out low quality or short mappings
    map_out_df = map_out_df.query("mapping_qual >= 40 & num_residue_matches >= 200")
    if map_out_df.empty:
        logger.info("Insufficient quality mappings.")
        return pd.DataFrame()
    # See whether the start or end of a mapping falls into the region
    map_out_df["read_is_red"] = 0
    map_out_df["read_is_red"] += target_region_df.apply(
        falls_in_region, args=(map_out_df,), axis=1
    )
    map_out_df["read_is_red"] = np.where(map_out_df["read_is_red"], 1, 0)
    map_out_df["name"] = map_out_df["name"].str.replace("_", " ")
    map_out_df = pd.merge(map_out_df, target_df, how="left", on=["read_id"])
    map_out_df["barcode_name"] = map_out_df["read_id"].map(
        target_df.set_index("read_id")["barcode_name"].loc[
            ~target_df.set_index("read_id")["barcode_name"].index.duplicated()
        ]
    )
    map_out_df = map_out_df.fillna(0)
    map_out_df["barcode_name"] = np.where(
        map_out_df["barcode_name"] == "No_barcode",
        "All reads",
        map_out_df["barcode_name"],
    )
    gb = map_out_df.groupby(["barcode_name", "name_y"])
    map_out_df.set_index(["barcode_name", "name_y"], inplace=True)
    map_out_df["num_mapped"] = gb.size()
    map_out_df["num_red_reads"] = gb["read_is_red"].sum()
    map_out_df = map_out_df.loc[~map_out_df.index.duplicated()]
    map_out_df.reset_index(inplace=True)
    return map_out_df
def handle(self, *args, **options):
    """
    Handle the execution of the command
    :param args: The arguments, whether they are present or not
    :param options: The values that have been added to the arguments
    :return:
    """
    try:
        reference_files = []
        # These should be lowercase and include the '.'
        endings = {
            ".fna",
            ".fa",
            ".fasta",
            ".fsa",
        }
        if not options["key"]:
            print(
                "To add references, your minotour api_key is required. "
                "This can be found on the profile page of your account."
            )
            return
        for file_or_directory in options["reference"]:
            reference_files.extend(find_files_of_type(file_or_directory, endings))
        if not reference_files:
            raise FileNotFoundError(
                f"No files found at specified location! Endings included are {pformat(endings)}"
            )
        private = False
        # If we want private references
        user = Token.objects.get(key=options["key"]).user
        if options["private"]:
            private = True
        # remove none from reference_files
        reference_files = list(filter(None.__ne__, reference_files))
        previous_ref = set(
            ReferenceInfo.objects.filter(private=False)
            .values_list("name", flat=True)
            .distinct()
        )
        # If it's private check we aren't multiplying an already existing private reference
        if options["private"]:
            previous_ref = previous_ref.union(
                set(
                    ReferenceInfo.objects.filter(private=True, uploader=user)
                    .values_list("name", flat=True)
                    .distinct()
                )
            )
        for ref_file in reference_files:
            # Get the species name of this reference, no file suffixes
            ref_file_stem = str(ref_file.stem).partition(".")[0]
            print("Processing file: {}".format(ref_file.name))
            if ref_file_stem in previous_ref:
                print(
                    "A reference already exists for this species name: {}".format(
                        ref_file_stem
                    )
                )
                print(
                    "If you believe this is in error, or want to add this reference anyway,"
                    " please change the filename"
                )
                continue
            duplicated, sha256_hash = validate_reference_checks(ref_file, user)
            if duplicated:
                return
            # Choose the fastq or fasta parser based on the file suffixes
            handle = (
                pyfastx.Fastq
                if set(ref_file.suffixes).intersection({".qz", ".gzip"})
                else pyfastx.Fasta
            )
            # Check that the minimap2 index location folder exists
            index_dir_path = (
                MEDIA_ROOT
                if get_env_variable("MT_MINIMAP2_INDEX_DIR").isdigit()
                else get_env_variable("MT_MINIMAP2_INDEX_DIR")
            )
            minimap2_index_path = f"{index_dir_path}/minimap2_indexes/"
            if not Path(minimap2_index_path).exists():
                raise FileNotFoundError(
                    f"Minimap2 index directory does not exist at {minimap2_index_path}. Please create it!"
                )
            # build minimap2 index
            minimap2_index_path += f"{ref_file.stem}.mmi"
            print("Building minimap2 index, please wait.....")
            # Capture stdout/stderr so the prints below show minimap2's output rather than None
            out, err = subprocess.Popen(
                f"{MINIMAP2} -d {minimap2_index_path}"
                f" {ref_file.as_posix()}".split(),
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            ).communicate()
            print("Minimap2 index building output - ")
            print(out)
            print("\n")
            print(err)
            print("Built index. Parsing file...")
            # Individual lines (I.E Chromosomes in the reference)
            fa = handle(ref_file.as_posix())
            # Create the Reference info entry in the database
            ref_info, created = ReferenceInfo.objects.update_or_create(
                name=ref_file_stem,
                file_location=ref_file.resolve().as_posix(),
                file_name=ref_file.name,
                length=fa.size,
                private=private,
                uploader=user,
                minimap2_index_file_location=minimap2_index_path,
                sha256_checksum=sha256_hash,
            )
            # Create a Reference line entry for each "Chromosome/line"
            for contig in fa:
                ReferenceLine.objects.create(
                    reference=ref_info,
                    line_name=contig.name,
                    chromosome_length=len(contig),
                )
            print("Successfully handled file.")
    except Exception as e:
        raise CommandError(e)