def write_sample_file(gen_db_path,
                      fastq_list,
                      analysis_name,
                      execution_folder,
                      analysis_id):
    '''
    Write the tab-separated sample sheet of a run.

    The analysis is first flagged as "running" in the database, then one line
    per row returned by get_fastq_and_sample_data is written to
    <execution_folder>/<analysis_name>/<analysis_name>.tsv

    gen_db_path: accepted but not used by this function
    fastq_list: list of fastq ids, or a comma-separated string of ids
    analysis_name: name of the run sub-folder and stem of the output file
    execution_folder: parent folder of the run sub-folder
    analysis_id: analysis whose "airflow_execution_status" is updated
    '''
    db = gendb_utils.DB()

    # flag the analysis as running before doing any work
    gendb_utils.add_analysis_metadata(analysis_id, "airflow_execution_status", "running", update=True)

    fastq_ids = fastq_list if isinstance(fastq_list, list) else fastq_list.split(",")

    # the frame must provide the R1, R2, fastq_id, taxonomy and sample_name columns read below
    records = db.get_fastq_and_sample_data(fastq_ids)

    column_names = ("SampleName", "ScientificName", "R1", "R2", "fastq_id")
    table_path = os.path.join(execution_folder, analysis_name, f'{analysis_name}.tsv')

    def _as_line(record):
        read1, read2 = record["R1"], record["R2"]
        fastq_id = record["fastq_id"]
        species = record["taxonomy"]
        # the fastq id is appended so that several fastq sharing a sample name stay distinct
        unique_name = f'{record["sample_name"]}_{fastq_id}'
        return f"{unique_name}\t{species}\t{read1}\t{read2}\t{fastq_id}\n"

    with open(table_path, 'w') as handle:
        handle.write("\t".join(column_names) + '\n')
        handle.writelines(_as_line(record) for _, record in records.iterrows())
def backup_output_files_samples(metadata_name2path_template, 
                                fastq_list,
                                analysis_name,
                                analysis_id,
                                backup_folder):
    '''
    Register backed-up per-sample output files in the database.

    For every (metadata term, sample) pair the path template is filled in. If
    the resulting file exists under backup_folder, its relative path is stored
    as fastq metadata attached to analysis_id; missing files are skipped with
    a warning.

    metadata_name2path_template: dict {term name: path template}; templates
        are formatted with the "analysis_name" and "sample" fields
    fastq_list: fastq ids, or a comma-separated string of ids
    analysis_name: value substituted for {analysis_name}
    analysis_id: analysis the metadata entries are attached to
    backup_folder: folder against which the relative paths, minus their first
        component, are resolved
    '''
    GEN_DB = gendb_utils.DB()

    # a comma-separated string is accepted too, as write_sample_file already does
    if isinstance(fastq_list, str):
        fastq_list = fastq_list.split(",")

    fastq_df = GEN_DB.get_fastq_and_sample_data(fastq_list)

    # sample labels do not depend on the metadata term: build them only once
    samples = [(row["fastq_id"], f'{row["sample_name"]}_{row["fastq_id"]}')
               for _, row in fastq_df.iterrows()]

    qc_data = []
    for metadata_name, path_template in metadata_name2path_template.items():
        for fastq_id, sample_name in samples:
            # assume structure: {workflow}/{analysis_name}/{filepath}
            backup_path_format_relative = path_template.format(analysis_name=analysis_name, sample=sample_name)
            # drop the first path component (everything up to the first "/")
            backup_path_format_absolute = os.path.join(backup_folder,
                                                       backup_path_format_relative.partition("/")[2])

            print("backup_path_format", backup_path_format_relative)
            if not os.path.exists(backup_path_format_absolute):
                print(f"WARNING: {backup_path_format_absolute} does not exist, skipping")
                continue

            qc_data.append((fastq_id, metadata_name, backup_path_format_relative))

    # insert only once every path has been checked
    for fastq_id, metrics_name, metrics_value in qc_data:
        print("inserting", analysis_id, fastq_id, metrics_name, metrics_value)
        GEN_DB.add_fastq_metadata(fastq_id=fastq_id,
                                  term_name=metrics_name,
                                  value=metrics_value,
                                  analysis_id=analysis_id)
Example #3
0
def parse_molis_xml(XML_TABLE, ):
    '''
    Parse a MOLIS spreadsheet export (XML made of Row / Cell / Data elements)
    into a pandas DataFrame.

    The first Row provides the column names, every following Row one record.
    The value of a cell is the text held by its <Data> child; an empty cell or
    an empty <Data> yields ''. Cell values are kept as text.

    XML_TABLE: path of the XML file, or an open file object
    Returns: pandas.DataFrame; empty if the document contains no Row, and
        without records if it only contains the header Row

    Header cells recorded by the original author for such an export (they are
    used verbatim as column names):
        N° de demande, Période, Numéro de demande, Patient hospitalisé,
        Demandeur, Sexe, Date de naissance, Numéro de patient, Unité de soins,
        Numéro de projet, Patient, Numéro alias, Référence externe,
        Numéro patient externe, Date saisie dem., Heure de saisie de la demande,
        Date de réception, Heure de réception, Date prélèvement,
        Heure de prélèvement, Numéro de séjour, Date dern. édition,
        L'heure du compte-rendu, Remarque interne, Remarque sur compte rendu,
        Renseignements cliniques, Statut validation niv. 2 (Méd.), Matériel,
        Adresse, CodePostal, Ville, Canton-Pays, COVTYP, PREL
    '''
    from xml.dom import minidom

    def _cell_text(cell):
        # The value is held by the first element child of the cell (<Data>).
        # Whitespace text nodes of an indented document are skipped, and a
        # cell without any element child is treated as empty.
        for child in cell.childNodes:
            if child.nodeType == child.ELEMENT_NODE:
                return child.childNodes[0].nodeValue if child.childNodes else ''
        return ''

    xmldoc = minidom.parse(XML_TABLE)

    columns = None
    row_list = []
    for row in xmldoc.getElementsByTagName('Row'):
        values = [_cell_text(cell) for cell in row.getElementsByTagName('Cell')]
        if columns is None:
            # first row: header
            columns = values
        else:
            row_list.append(values)

    if columns is None:
        # no Row at all
        return pandas.DataFrame()
    if not row_list:
        # header only: keep the column names, no records
        return pandas.DataFrame(columns=columns)

    df = pandas.DataFrame(row_list)
    df.columns = columns

    return df
def backup(execution_folder, 
           backup_folder,
           file_or_folder_list,
           analysis_id=False,
           analysis_name=False,
           fastq_list=False,
           output_selection=False,
           config=False,
           workflow_name=False,
           compress_ext=None):
    '''
    Copy pipeline outputs to the backup folder and register their paths in the database.

    execution_folder: root folder of the executions
    backup_folder: root folder receiving the copies
    file_or_folder_list: outputs to copy, each entry being one of
        - str: path relative to <execution_folder>/<analysis_name>, copied to the same
          relative path under <backup_folder>/<analysis_name>; "{analysis_name}" is substituted
        - list [glob pattern, target dir]: every file matching the pattern (relative to
          <execution_folder>/<analysis_name>) is copied into
          <backup_folder>/<analysis_name>/<target dir> under its own basename
        - dict with keys "glob", "regex", "vars", "target" and optionally "term": matching
          files are copied under a name built from the regex groups; the first path
          component of the formatted target is dropped to obtain the location under
          backup_folder. With "term", the formatted target is stored as fastq metadata
    analysis_id: if set, the analysis is flagged as "running" and metadata is attached to it
    analysis_name: folder within execution_folder (generally execution date)
    fastq_list: fastq ids (list or comma-separated string); enables per-sample registration
    output_selection: keys (list or comma-separated string) selecting entries of the
        "ANALYSIS" / "INDIVIDUAL_SAMPLES" configuration sections
    config: configuration read as config["WORKFLOW"][workflow_name]["PIPELINE_OUTPUT"];
        nothing is registered from the configuration when it is not provided
    workflow_name: key of the workflow in config["WORKFLOW"]
    compress_ext: handed to copy_and_compress (presumably the extensions to compress;
        confirm against that helper). Defaults to fna, faa, gbk, gbff, vcf, tsv, csv, gff
    '''

    import glob
    import re

    # a fresh list per call: a list literal used as default would be shared between calls
    if compress_ext is None:
        compress_ext = ["fna", "faa", "gbk", "gbff", "vcf", "tsv", "csv", "gff"]

    if analysis_id:
        # update status
        print("analysis_id", analysis_id)
        gendb_utils.add_analysis_metadata(analysis_id, "airflow_execution_status", "running", update=True)

    print("backup_folder", backup_folder)

    # copy files and folders to backup directory
    for n, output in enumerate(file_or_folder_list):
        print("BACKUP LIST", n, output)
        # list with reference and target path
        if isinstance(output, list):
            print("LIST...")
            # copy files to specified target directory
            file_list = glob.glob(os.path.join(execution_folder, analysis_name, output[0]))
            target_dir = os.path.join(backup_folder, analysis_name, output[1])
            for one_file in file_list:
                target_abs_path = os.path.join(target_dir, os.path.basename(one_file))
                print("copy---", one_file, target_abs_path)
                copy_and_compress(one_file, target_abs_path, compress_ext)

        elif isinstance(output, dict):
            print("DICT...")
            # more complex copy with renaming
            # {glob: samples/*/mapping/bwa/*_assembled_genome.bam, regex: .*/samples/(.*)/mapping/bwa/(.*)_assembled_genome.bam, vars: {1: 'sample', 2: 'reference'}, target: "mapping/{sample}-vs-{reference}.bam", term: bam_file}
            GEN_DB = gendb_utils.DB()
            file_list = glob.glob(os.path.join(execution_folder, analysis_name, output["glob"]))
            for one_file in file_list:
                # NOTE(review): a file matched by the glob but not by the regex
                # makes re.search return None and the next line raise AttributeError
                s = re.search(output["regex"], one_file)
                term2value = {output["vars"][index]: s.group(index) for index in output["vars"]}
                term2value.update({'analysis_name': analysis_name})
                target_format = output["target"].format_map(term2value)
                target_path_full = os.path.join(backup_folder, '/'.join(target_format.split("/")[1:]))
                # copy file to target location
                os.makedirs(os.path.dirname(target_path_full), exist_ok=True)
                copy_and_compress(one_file, target_path_full, compress_ext)
                # save path in db
                if "term" in output:
                    # the fastq id is the text following the last "_" of the sample label
                    fastq_id = term2value["sample"].split("_")[-1]
                    GEN_DB.add_fastq_metadata(fastq_id=fastq_id,
                                              term_name=output["term"],
                                              value=target_format,
                                              analysis_id=analysis_id)
        else:
            print("MIROR...")
            # simplest case
            # copy identical path
            output = output.format(analysis_name=analysis_name)
            original = os.path.join(execution_folder, analysis_name, output)
            target = os.path.join(backup_folder, analysis_name, output)
            # copy and compress what can be compressed
            copy_and_compress(original, target, compress_ext)

    # save file paths into database
    # can be either nested dictionaries or a single dictionary
    print("CONF", config, fastq_list)
    if config:
        pipeline_output = config["WORKFLOW"][workflow_name]["PIPELINE_OUTPUT"]
        if output_selection:
            # nested dictionaries: keep the selected entries and merge them
            if isinstance(output_selection, str):
                output_selection = output_selection.split(",")
            metadata_lst = [value for key, value in pipeline_output["ANALYSIS"].items() if key in output_selection]
            analysis_metadata_name2template = {k: v for d in metadata_lst for k, v in d.items()}

            if fastq_list:
                metadata_lst = [value for key, value in pipeline_output["INDIVIDUAL_SAMPLES"].items() if key in output_selection]
                sample_metadata_name2template = {k: v for d in metadata_lst for k, v in d.items()}
        else:
            # single dictionary: used as is
            analysis_metadata_name2template = pipeline_output["ANALYSIS"]
            if fastq_list:
                sample_metadata_name2template = pipeline_output["INDIVIDUAL_SAMPLES"]
        print("backup path analysis", analysis_metadata_name2template)
        backup_output_files_analysis(analysis_metadata_name2template, 
                                     analysis_name,
                                     analysis_id,
                                     backup_folder)
        if fastq_list:
            print("fastq_list", fastq_list)
            print("sample_metadata_name2template", sample_metadata_name2template)
            # an already split list of ids is accepted as well
            if isinstance(fastq_list, str):
                fastq_list = fastq_list.split(",")
            backup_output_files_samples(sample_metadata_name2template, 
                                        fastq_list,
                                        analysis_name,
                                        analysis_id,
                                        backup_folder)
def write_snakemake_config_file(analysis_name,
                                fastq_list,
                                execution_folder,
                                snakemake_config,
                                gen_db_path,
                                analysis_id,
                                reference_list=False,
                                check_single_species=False,
                                reference_docx=False,
                                additional_args=False):
    '''
    Write the snakemake configuration of a run as YAML to
    <execution_folder>/<analysis_name>/<analysis_name>.config

    The analysis is first flagged as "running" in the database.
    snakemake_config is updated IN PLACE before being dumped.

    analysis_name: run sub-folder and stem of the config / sample table names
    fastq_list: fastq ids (list or comma-separated string)
    execution_folder: parent folder of the run sub-folder
    snakemake_config: dict receiving "local_samples", "species" and, when
        applicable, "reference", "reference_docx" and the additional_args entries
    gen_db_path: accepted but not used by this function
    analysis_id: analysis whose "airflow_execution_status" is updated
    reference_list: references (list or comma-separated string); entries found
        in the database as fastq ids are written as <sample_name>_<fastq_id>,
        the others are kept as is and reported as external references
    check_single_species: raise if the fastq belong to more than one species
    reference_docx: optional value stored under "reference_docx"
    additional_args: optional dict of extra entries copied into the config

    Raises IOError if check_single_species is set and several species are found.
    '''

    GEN_DB = gendb_utils.DB()

    # update status
    gendb_utils.add_analysis_metadata(analysis_id, "airflow_execution_status", "running", update=True)

    run_execution_folder = os.path.join(execution_folder, analysis_name)

    # an already split list of ids is accepted as well
    if isinstance(fastq_list, str):
        fastq_list = fastq_list.split(",")

    species_list = list(set(GEN_DB.get_fastq_id2species(fastq_list).values()))
    print("species_list", species_list)
    if check_single_species and len(species_list) > 1:
        # str() keeps the message buildable when a species is not a string (e.g. None)
        raise IOError("More than one different species in the dataset: %s" % ','.join(map(str, species_list)))

    # if only one species, set scientific_name
    # otherwise Mixed
    scientific_name = species_list[0] if len(species_list) == 1 else 'Mixed'

    print("reference list:", reference_list)
    # if references, prepare list
    if reference_list:
        if isinstance(reference_list, str):
            reference_list = reference_list.split(",")
        fastq_df = GEN_DB.get_fastq_and_sample_data(reference_list)
        # check if external ref: anything the database does not know as a fastq id
        known_fastq_ids = set(fastq_df["fastq_id"].astype(str))
        ref_list = [ref for ref in reference_list if str(ref) not in known_fastq_ids]

        if ref_list:
            print(f"WARNING: external reference genome -- {','.join(map(str, ref_list))}")
        ref_list += [f'{row["sample_name"]}_{row["fastq_id"]}' for n, row in fastq_df.iterrows()]

    print("additional_args", additional_args)
    with open(os.path.join(run_execution_folder, f'{analysis_name}.config'), 'w') as f:
        # update sample table name
        snakemake_config["local_samples"] = f'{analysis_name}.tsv'
        if reference_list:
            print("ref list", ref_list)
            snakemake_config["reference"] = ",".join(map(str, ref_list))
        snakemake_config["species"] = f'{scientific_name}'
        if reference_docx:
            snakemake_config["reference_docx"] = f'{reference_docx}'
        if additional_args:
            for arg in additional_args:
                snakemake_config[arg] = additional_args[arg]

        yaml.dump(snakemake_config, f)