Example #1
0
def select_pass_variants(raw_vcf, nocall_fr=0.1):
    """
    Filter a VCF file, keeping only PASS positions and adding .pass to the output file name.
    Used because it creates the necessary VCF index.
    https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_variantutils_SelectVariants.php
    https://gatkforums.broadinstitute.org/gatk/discussion/13127/do-gatk4-tools-ignore-vcf-sites-marked-as-filtered-or-must-they-be-removed-from-the-file
    """
    #max_nocall=2,

    input_vcf = os.path.abspath(raw_vcf)
    check_file_exists(input_vcf)

    raw_vcf_file_name = (".").join(input_vcf.split(".")[:-1])

    extension = ".pass.vcf"
    vcf_selected_output_file = raw_vcf_file_name + extension

    cmd = [
        "gatk", "SelectVariants", "--variant", input_vcf,
        "--max-nocall-fraction",
        str(nocall_fr), "--exclude-filtered", "--remove-unused-alternates",
        "--output", vcf_selected_output_file
    ]

    #"--max-nocall-number", str(max_nocall),
    execute_subprocess(cmd)
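
Note: this and the following examples call two helpers defined elsewhere in the repository, check_file_exists and execute_subprocess. A minimal sketch of their assumed behaviour (the real implementations may add colour-coded logging or other checks):

import os
import subprocess
import sys

def check_file_exists(path):
    # Assumed behaviour: abort if the file is missing or empty.
    if not os.path.isfile(path) or os.path.getsize(path) == 0:
        sys.exit("File %s is missing or empty" % path)

def execute_subprocess(cmd):
    # Assumed behaviour: run the command list and report any failure.
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        print(result.stderr)
    return result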
Example #2
0
def select_variants(raw_vcf, select_type='SNP'):
    """
    https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_variantutils_SelectVariants.php
    gatk SelectVariants -V cohort.vcf.gz -select-type SNP -O snps.vcf.gz
    """
    if select_type == "SNP":
        extension = ".snp.vcf"
    elif select_type == "INDEL":
        extension = ".indel.vcf"
    else:
        print(RED + BOLD + "Choose a correct type to filter" + END_FORMATTING)
        sys.exit(1)

    input_vcf = os.path.abspath(raw_vcf)
    check_file_exists(input_vcf)

    raw_vcf_file_name = (".").join(input_vcf.split(".")[:-2])
    #file_name = raw_vcf_file_name.split("/")[-1] #sample_name

    vcf_selected_output_file = raw_vcf_file_name + extension

    #memory_param = "-Xmx" + str(args.memory) + "g"
    #"--java-options", memory_param,

    cmd = [
        "gatk", "SelectVariants", "--variant", input_vcf,
        "--select-type-to-include", select_type, "--select-type-to-include",
        "MIXED", "--output", vcf_selected_output_file
    ]

    #    "--remove-unused-alternates",

    execute_subprocess(cmd)
Example #3
0
def ddtb_compare(final_database):

    database_file = os.path.abspath(final_database)
    check_file_exists(database_file)
    presence_ddbb = import_to_pandas(database_file, header=True)

    output_path = database_file.split(".")[0]

    print("Output path is: " + output_path)


    print(BLUE + BOLD + "Comparing all samples in " + database_file + END_FORMATTING)
    prior_pairwise = datetime.datetime.now()

    #Calculate pairwise snp distance for all and save file
    print(CYAN + "Pairwise distance" + END_FORMATTING)
    pairwise_file = output_path + ".snp.pairwise.tsv"
    snp_distance_pairwise(presence_ddbb, pairwise_file)
    after_pairwise = datetime.datetime.now()
    print("Done with pairwise in: %s" % (after_pairwise - prior_pairwise))

    #Calculate snp distance for all and save file
    print(CYAN + "SNP distance" + END_FORMATTING)
    snp_dist_file = output_path + ".snp.tsv"
    snp_distance_matrix(presence_ddbb, snp_dist_file)

    #Calculate hamming distance for all and save file
    print(CYAN + "Hamming distance" + END_FORMATTING)
    hmm_dist_file = output_path + ".hamming.tsv"
    hamming_distance_matrix(presence_ddbb, hmm_dist_file)
    """
    #Represent pairwise snp distance for all and save file
    print(CYAN + "Drawing distance" + END_FORMATTING)
    prior_represent = datetime.datetime.now()
    png_dist_file = output_path + ".snp.distance.png"
    #clustermap_dataframe(presence_ddbb, png_dist_file)
    after_represent = datetime.datetime.now()
    print("Done with distance drawing in: %s" % (after_represent - prior_represent))
    """
    #Represent dendrogram snp distance for all and save file
    print(CYAN + "Drawing dendrogram" + END_FORMATTING)
    png_dend_file = output_path + ".snp.dendrogram.png"
    dendogram_dataframe(presence_ddbb, png_dend_file)

    #Output a Newick file distance for all and save file
    print(CYAN + "Newick dendrogram" + END_FORMATTING)
    newick_file = output_path + ".nwk"
    linkage_to_newick(presence_ddbb, newick_file)

    #Output a binary snp matrix distance in rdf format
    print(CYAN + "rdf format" + END_FORMATTING)
    rdf_file = output_path + ".rdf"
    matrix_to_rdf(presence_ddbb, rdf_file)

    #Output a list of all common snps in group compared
    print(CYAN + "Common SNPs" + END_FORMATTING)
    common_file = output_path + ".common.txt"
    matrix_to_common(presence_ddbb, common_file)
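
The distance helpers called above (snp_distance_pairwise, snp_distance_matrix, hamming_distance_matrix, ...) are defined elsewhere. As an illustration of the data layout they assume (fixed Position/N/Samples columns followed by one 0/1 presence column per sample), here is a minimal sketch of a pairwise SNP distance matrix; the real snp_distance_matrix may differ:

import pandas as pd

def snp_distance_matrix_sketch(presence_ddbb, out_file):
    # Sample columns are assumed to follow the fixed Position, N and Samples columns.
    samples = presence_ddbb.columns[3:]
    flags = presence_ddbb[samples].astype(int)
    # Distance between two samples = number of positions where their 0/1 flags differ.
    dist = pd.DataFrame(
        [[(flags[a] != flags[b]).sum() for b in samples] for a in samples],
        index=samples, columns=samples,
    )
    dist.to_csv(out_file, sep="\t")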
Example #4
0
def ddtb_compare(args):

    database_file = os.path.abspath(args.final_database)
    check_file_exists(database_file)
    presence_ddbb = import_to_pandas(database_file, header=True)

    if args.output_file:
        output_file = os.path.abspath(args.output_file)
        output_path = output_file.split(".")[0]
    else:
        output_path = database_file.split(".")[0]

    print("Output path is: " + output_path)

    if args.all_compare:
        print(BLUE + BOLD + "Comparing all samples in " + database_file +
              END_FORMATTING)
        prior_pairwise = datetime.datetime.now()

        #Calculate pairwise snp distance for all and save file
        print(CYAN + "Pairwise distance" + END_FORMATTING)
        pairwise_file = output_path + ".snp.pairwise.tsv"
        snp_distance_pairwise(presence_ddbb, pairwise_file)
        after_pairwise = datetime.datetime.now()
        print("Done with pairwise in: %s" % (after_pairwise - prior_pairwise))

        #Calculate snp distance for all and save file
        print(CYAN + "SNP distance" + END_FORMATTING)
        snp_dist_file = output_path + ".snp.tsv"
        snp_distance_matrix(presence_ddbb, snp_dist_file)

        #Calculate hamming distance for all and save file
        print(CYAN + "Hamming distance" + END_FORMATTING)
        hmm_dist_file = output_path + ".hamming.tsv"
        hamming_distance_matrix(presence_ddbb, hmm_dist_file)
        """
        #Represent pairwise snp distance for all and save file
        print(CYAN + "Drawing distance" + END_FORMATTING)
        prior_represent = datetime.datetime.now()
        png_dist_file = output_path + ".snp.distance.png"
        #clustermap_dataframe(presence_ddbb, png_dist_file)
        after_represent = datetime.datetime.now()
        print("Done with distance drawing in: %s" % (after_represent - prior_represent))
        """
        #Represent dendrogram snp distance for all and save file
        print(CYAN + "Drawing dendrogram" + END_FORMATTING)
        png_dend_file = output_path + ".snp.dendrogram.png"
        dendogram_dataframe(presence_ddbb, png_dend_file)

        #Output a Newick file distance for all and save file
        print(CYAN + "Newick dendrogram" + END_FORMATTING)
        newick_file = output_path + ".nwk"
        linkage_to_newick(presence_ddbb, newick_file)

    else:
        print("sample mode is not implemented")
    """
Example #5
0
def hard_filter(selected_vcf, select_type='SNP'):
    """
    https://software.broadinstitute.org/gatk/documentation/article.php?id=6925
    https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_filters_VariantFiltration.php
    https://software.broadinstitute.org/gatk/documentation/article?id=23216
    SNP:
    gatk VariantFiltration -V snps.vcf.gz \
        --filter-expression "QD < 2.0" --filter-name "QD2" \
        --filter-expression "QUAL < 30.0" --filter-name "QUAL30" \
        --filter-expression "SOR > 3.0" --filter-name "SOR3" \
        --filter-expression "FS > 60.0" --filter-name "FS60" \
        --filter-expression "MQ < 40.0" --filter-name "MQ40" \
        --filter-expression "MQRankSum < -12.5" --filter-name "MQRankSum-12.5" \
        --filter-expression "ReadPosRankSum < -8.0" --filter-name "ReadPosRankSum-8" \
        -O snps_filtered.vcf.gz
    INDEL:
    gatk VariantFiltration -V indels.vcf.gz \
        --filter-expression "QD < 2.0" --filter-name "QD2" \
        --filter-expression "QUAL < 30.0" --filter-name "QUAL30" \
        --filter-expression "FS > 200.0" --filter-name "FS200" \
        --filter-expression "ReadPosRankSum < -20.0" --filter-name "ReadPosRankSum-20" \
        -O indels_filtered.vcf.gz
    #--filterExpression "QD<2.0||FS>60.0||MQ<40.0||MQRankSum<-12.5||ReadPosRankSum<-8.0" --filterName "my_snp_filter"
    """

    input_vcf = os.path.abspath(selected_vcf)
    check_file_exists(input_vcf)

    selected_vcf_file_name = (".").join(input_vcf.split(".")[:-2])

    if select_type == "SNP":
        extension = ".snp.hf.vcf"
        vcf_hard_filtered_output_file = selected_vcf_file_name + extension
        cmd = [
            "gatk", "VariantFiltration", "--variant", input_vcf,
            "--filter-expression", "QD < 2.0", "--filter-name", "QD2",
            "--filter-expression", "QUAL < 30.0", "--filter-name", "QUAL30",
            "--filter-expression", "SOR > 3.5", "--filter-name", "SOR3",
            "--filter-expression", "FS > 60.0", "--filter-name", "FS60",
            "--filter-expression", "MQ < 40.0", "--filter-name", "MQ40",
            "--filter-expression", "DP < 10", "--filter-name", "DP10",
            "--filter-expression", "MQRankSum < -12.5", "--filter-name",
            "MQRankSum-12.5", "--filter-expression", "ReadPosRankSum < -8.0",
            "--filter-name", "ReadPosRankSum-8", "--output",
            vcf_hard_filtered_output_file
        ]

    elif select_type == "INDEL":
        extension = ".indel.hf.vcf"
        vcf_hard_filtered_output_file = selected_vcf_file_name + extension
        cmd = [
            "gatk", "VariantFiltration", "--variant", input_vcf,
            "--filter-expression", "QD < 2.0", "--filter-name", "QD2",
            "--filter-expression", "QUAL < 30.0", "--filter-name", "QUAL30",
            "--filter-expression", "SOR > 10.0", "--filter-name", "SOR10",
            "--filter-expression", "FS > 200.0", "--filter-name", "FS200",
            "--filter-expression", "ReadPosRankSum < -20.0", "--filter-name",
            "ReadPosRankSum-20", "--output", vcf_hard_filtered_output_file
        ]
    else:
        print(RED + BOLD + "Choose a correct type to filter" + END_FORMATTING)
        sys.exit(1)

    execute_subprocess(cmd)
Example #6
0
def select_pass(raw_vcf):
    """
    Homemade script.
    Filter a VCF file, keeping only PASS positions and adding .pass to the output file name.
    """
    input_vcf = os.path.abspath(raw_vcf)
    raw_vcf_file_name = (".").join(input_vcf.split(".")[:-1])

    extension = ".pass.vcf"
    vcf_selected_output_file = raw_vcf_file_name + extension

    check_file_exists(input_vcf)

    with open(input_vcf, "r") as f:
        with open(vcf_selected_output_file, "w") as f1:
            for line in f:
                if line.startswith("#"):
                    f1.write(line)
                else:
                    if line.split("\t")[6] == "PASS":
                        f1.write(line)
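
A toy usage sketch (the file name and records are hypothetical): header lines starting with '#' are copied verbatim and only records whose FILTER column (the seventh field) equals PASS are kept.

# Hypothetical input with one PASS and one LowQual record.
with open("toy.vcf", "w") as handle:
    handle.write("##fileformat=VCFv4.2\n")
    handle.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n")
    handle.write("NC_045512.2\t100\t.\tA\tG\t50\tPASS\t.\n")
    handle.write("NC_045512.2\t200\t.\tC\tT\t10\tLowQual\t.\n")

select_pass("toy.vcf")  # writes toy.pass.vcf with the two header lines plus the PASS record only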
Example #7
0
def ddtb_add(input_folder, output_filename, recalibrate=False, sample_filter=False, vcf_suffix=".combined.hf.SNP.final.vcf" ):
    directory = os.path.abspath(input_folder)
    output_filename = os.path.abspath(output_filename)

    #Make sure the output does not already exist, to force choosing a new name
    if os.path.isfile(output_filename):
        print(YELLOW + "ERROR: " + BOLD + "output database EXIST, choose a different name or manually delete" + END_FORMATTING)
        sys.exit(1)

    final_ddbb = blank_database()
    sample_filter_list = []


    #Handle sample filter
    if sample_filter == False:
        sample_filter_list = [x.split(".")[0] for x in os.listdir(input_folder) if x.endswith(vcf_suffix)]
    else:
        if os.path.isfile(sample_filter):
            with open(sample_filter, 'r') as f:
                for line in f:
                    sample_filter_list.append(line.strip())
        else:
            "Sample file don't exist"
            sys.exit(1)
    
    print(sample_filter_list)

    if len(sample_filter_list) < 2:
        print("Please provide 2 or more samples")
        sys.exit(1)

    #print("Previous final database contains %s rows and %s columns\n" % final_ddbb.shape)
    print("The directory selected is: %s" % directory)
    

    all_samples = 0
    new_samples = 0
    for filename in os.listdir(directory):
        if not filename.startswith('.') and filename.endswith(vcf_suffix):
            
            all_samples = all_samples + 1
            positions_shared = []
            positions_added = []
            
            sample = filename.split(".")[0] #Manage sample name

            if sample in sample_filter_list:
                print("\nThe file is: %s" % filename)

            
                file = os.path.join(directory, filename) #Whole file path
                check_file_exists(file) #Manage file[s]. Check if file exist and is greater than 0

                new_sample = import_VCF4_to_pandas(file) #Import files in annotated vcf format

                #Check if sample exist
                ######################
                if sample not in final_ddbb.columns.tolist():
                    print("Adding new sample %s to %s" % (sample, os.path.basename(output_filename)))
                    new_samples = new_samples + 1
                    new_colum_index = len(final_ddbb.columns) #extract the number of columns to insert a new one
                    #final_ddbb[sample] = sample #adds a new column but fills all blanks with the value sample
                    final_ddbb.insert(new_colum_index, sample, 0) #add a new column with default value 0
                    
                    #Check if position exist
                    ########################
                    for position in new_sample['POS'].unique(): #extract first column in file
                        
                        if position not in final_ddbb["Position"].values:
                            positions_added.append(int(position)) #Count new positions for stats
                            
                            new_row = len(final_ddbb.index)
                            final_ddbb.loc[new_row,'Position'] = int(position)
                            final_ddbb.loc[new_row,'Samples'] = sample
                            final_ddbb.loc[new_row,'N'] = int(1)
                            final_ddbb.loc[new_row,sample] = str(1)
                        else:
                            positions_shared.append(int(position)) #Count shared positions for stats
                            
                            #Check whether the column matches the value and retrieve the first position [0]
                            #of the object index generated
                            index_position = final_ddbb.index[final_ddbb["Position"] == int(position)][0]
                            #Add sample to corresponding cell [position, samples]
                            number_samples_with_position = final_ddbb.loc[index_position,'N']
                            names_samples_with_position = final_ddbb.loc[index_position,'Samples']
                            new_names_samples = names_samples_with_position + "," + sample
                            #Add 1 to the number of samples containing the position
                            final_ddbb.loc[index_position,'N'] = number_samples_with_position + 1
                            final_ddbb.loc[index_position,'Samples'] = new_names_samples
                            final_ddbb.loc[index_position,sample] = str(1) #Add "1" in cell with correct position vs sample (indicate present)

                    print("\nSAMPLE:\t%s\nTOTAL Variants:\t%s\nShared Variants:\t%s\nNew Variants:\t%s\n"
                    % (sample, len(new_sample.index), len(positions_shared), len(positions_added)))
                else:
                    print(YELLOW + "The sample " + sample + " ALREADY exist" + END_FORMATTING)

    #final_ddbb = final_ddbb.fillna(0).sort_values("Position") 
    final_ddbb["Position"] = final_ddbb["Position"].astype(int) #TO REMOVE when nucleotides are added
    final_ddbb['N'] = final_ddbb['N'].astype(int)
    #final_ddbb = final_ddbb.reset_index(drop=True)

    print("Final database now contains %s rows and %s columns" % final_ddbb.shape)
    if recalibrate == False:
        output_filename = output_filename + ".tsv"
        final_ddbb.to_csv(output_filename, sep='\t', index=False)
    else:
        recalibrate = os.path.abspath(recalibrate)
        if os.path.exists(recalibrate):
            recalibrate_params = extract_recalibrate_params(recalibrate)
            print("\n" + MAGENTA + "Recalibration selected" + END_FORMATTING)
            print(output_filename)
            output_filename = output_filename + ".revised.tsv"

            final_ddbb_revised = recalibrate_ddbb_vcf(final_ddbb, recalibrate_params[0], recalibrate_params[1], recalibrate_params[2])
            """
            if args.reference and args.reference != False:
                final_ddbb_revised = recalibrate_ddbb_vcf(final_ddbb, recalibrate_params[0], recalibrate_params[1], args.reference)
                
            else:
                final_ddbb_revised = recalibrate_ddbb_vcf(final_ddbb, recalibrate_params[0], recalibrate_params[1], recalibrate_params[2])
            """
            final_ddbb_revised.to_csv(output_filename, sep='\t', index=False)
        else:
            print("The directory supplied for recalculation does not exixt")
            sys.exit(1)
    print(output_filename)

    #Create small report with basic count
    #####################################
            
    print("\n" + GREEN + "Position check Finished" + END_FORMATTING)
    print(GREEN + "Added " + str(new_samples) + " samples out of " + str(all_samples) + END_FORMATTING + "\n")
Example #8
0
def main():
    """
    Create main function to capture code errors: https://stackoverflow.com/questions/6234405/logging-uncaught-exceptions-in-python
    """

    # ARGUMENTS

    def get_arguments():

        parser = argparse.ArgumentParser(
            prog='covidma.py',
            description=
            'Pipeline to call variants (SNVs) in any non-model organism. Specialised in SARS-CoV-2'
        )

        input_group = parser.add_argument_group('Input', 'Input parameters')

        input_group.add_argument(
            '-i',
            '--input',
            dest="input_dir",
            metavar="input_directory",
            type=str,
            required=True,
            help='REQUIRED. Input directory containing all fast[aq] files')
        input_group.add_argument('-r',
                                 '--reference',
                                 metavar="reference",
                                 type=str,
                                 required=True,
                                 help='REQUIRED. File to map against')
        input_group.add_argument(
            '-a',
            '--annotation',
            metavar="annotation",
            type=str,
            required=True,
            help='REQUIRED. gff3 file to annotate variants')
        input_group.add_argument('-s',
                                 '--sample',
                                 metavar="sample",
                                 type=str,
                                 required=False,
                                 help='Sample to identify further files')
        input_group.add_argument(
            '-L',
            '--sample_list',
            type=str,
            required=False,
            help='File listing the sample names to analyse; only these will be processed')
        input_group.add_argument(
            '-p',
            '--primers',
            type=str,
            default=
            '/home/laura/DATABASES/Anotacion/COVID/primers/nCoV-2019.bed',
            required=False,
            help='Bed file including primers to trim')

        quality_group = parser.add_argument_group(
            'Quality parameters', 'Parameters for different trimming conditions')

        quality_group.add_argument(
            '-c',
            '--coverage20',
            type=int,
            default=90,
            required=False,
            help=
            'Minimum percentage of coverage at 20x to classify as uncovered (Default 90)'
        )
        quality_group.add_argument('-n',
                                   '--min_snp',
                                   type=int,
                                   required=False,
                                   default=1,
                                   help='SNP number to pass quality threshold')

        output_group = parser.add_argument_group(
            'Output', 'Required parameter to output results')

        output_group.add_argument(
            '-o',
            '--output',
            type=str,
            required=True,
            help='REQUIRED. Output directory to extract all results')
        output_group.add_argument(
            '-C',
            '--noclean',
            required=False,
            action='store_false',
            help='Clean unwanted files for standard execution')

        params_group = parser.add_argument_group(
            'Parameters', 'Parameters for different stringency conditions')

        params_group.add_argument('-T',
                                  '--threads',
                                  type=str,
                                  dest="threads",
                                  required=False,
                                  default=16,
                                  help='Threads to use')
        params_group.add_argument('-M',
                                  '--memory',
                                  type=str,
                                  dest="memory",
                                  required=False,
                                  default=32,
                                  help='Max memory to use')

        annot_group = parser.add_argument_group(
            'Annotation', 'parameters for variant annotation')

        annot_group.add_argument('-B',
                                 '--annot_bed',
                                 type=str,
                                 default=[],
                                 required=False,
                                 action='append',
                                 help='bed file to annotate')
        annot_group.add_argument('-V',
                                 '--annot_vcf',
                                 type=str,
                                 default=[],
                                 required=False,
                                 action='append',
                                 help='vcf file to annotate')
        annot_group.add_argument('-A',
                                 '--annot_aa',
                                 type=str,
                                 default=[],
                                 required=False,
                                 action='append',
                                 help='amino acid file to annotate')
        annot_group.add_argument('-R',
                                 '--remove_bed',
                                 type=str,
                                 default=False,
                                 required=False,
                                 help='BED file with positions to remove')
        annot_group.add_argument(
            '--mash_database',
            type=str,
            required=False,
            default=False,
            help='MASH ncbi annotation containing all species database')
        annot_group.add_argument('--snpeff_database',
                                 type=str,
                                 required=False,
                                 default='NC_045512.2',
                                 help='snpEFF annotation database')

        compare_group = parser.add_argument_group(
            'Compare', 'parameters for compare_snp')

        compare_group.add_argument('-S',
                                   '--only_snp',
                                   required=False,
                                   action='store_true',
                                   help='Use only SNPs (exclude INDELs) when comparing')

        arguments = parser.parse_args()

        return arguments

    args = get_arguments()

    ######################################################################
    #####################START PIPELINE###################################
    ######################################################################
    output = os.path.abspath(args.output)
    group_name = output.split("/")[-1]
    reference = os.path.abspath(args.reference)
    annotation = os.path.abspath(args.annotation)

    # LOGGING
    # Create log file with date and time
    right_now = str(datetime.datetime.now())
    right_now_full = "_".join(right_now.split(" "))
    log_filename = group_name + "_" + right_now_full + ".log"
    log_folder = os.path.join(output, 'Logs')
    check_create_dir(log_folder)
    log_full_path = os.path.join(log_folder, log_filename)

    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)

    formatter = logging.Formatter('%(asctime)s:%(message)s')

    file_handler = logging.FileHandler(log_full_path)
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(formatter)

    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.INFO)
    # stream_handler.setFormatter(formatter)

    logger.addHandler(stream_handler)
    logger.addHandler(file_handler)

    logger.info("\n\n" + BLUE + BOLD + "STARTING PIPELINE IN GROUP: " +
                group_name + END_FORMATTING)

    today = str(datetime.date.today())

    logger.info("ARGUMENTS:")
    logger.info(str(args))

    # Obtain all R1 and R2 from folder
    r1, r2 = extract_read_list(args.input_dir)

    # Check if there are samples to filter out
    sample_list_F = []
    if args.sample_list == None:
        logger.info("\n" + "No samples to filter")
        for r1_file, r2_file in zip(r1, r2):
            sample = extract_sample(r1_file, r2_file)
            sample_list_F.append(sample)
    else:
        logger.info("samples will be filtered")
        sample_list_F = file_to_list(args.sample_list)

    new_samples = check_reanalysis(args.output, sample_list_F)

    logger.info("\n%d samples will be analysed: %s" %
                (len(new_samples), ",".join(new_samples)))

    #PREPARE REFERENCE FOR MAPPING + FAI + DICT #########
    #####################################################

    # picard_dictionary(args)
    samtools_faidx(args)

    #DECLARE FOLDERS CREATED IN PIPELINE ################
    #AND KEY FILES ######################################
    #####################################################
    # Annotation related parameters
    # script_dir = os.path.dirname(os.path.realpath(__file__))

    # Output related
    out_qc_dir = os.path.join(output, "Quality")
    out_qc_pre_dir = os.path.join(out_qc_dir, "raw")  # subfolder
    out_qc_post_dir = os.path.join(out_qc_dir, "processed")  # subfolder
    out_trim_dir = os.path.join(output, "Trimmed")
    out_map_dir = os.path.join(output, "Bam")
    out_variant_dir = os.path.join(output, "Variants")
    out_variant_ivar_dir = os.path.join(out_variant_dir,
                                        "ivar_raw")  # subfolder
    out_filtered_ivar_dir = os.path.join(out_variant_dir,
                                         "ivar_filtered")  # subfolder
    out_consensus_dir = os.path.join(output, "Consensus")
    out_consensus_ivar_dir = os.path.join(out_consensus_dir,
                                          "ivar")  # subfolder

    out_stats_dir = os.path.join(output, "Stats")
    out_stats_bamstats_dir = os.path.join(out_stats_dir,
                                          "Bamstats")  # subfolder
    out_stats_coverage_dir = os.path.join(out_stats_dir,
                                          "Coverage")  # subfolder
    out_compare_dir = os.path.join(output, "Compare")

    out_annot_dir = os.path.join(output, "Annotation")
    out_annot_snpeff_dir = os.path.join(out_annot_dir, "snpeff")  # subfolder
    out_annot_pangolin_dir = os.path.join(out_annot_dir,
                                          "pangolin")  # subfolder
    out_annot_user_dir = os.path.join(out_annot_dir, "user")  # subfolder
    out_annot_user_aa_dir = os.path.join(out_annot_dir, "user_aa")  # subfolder

    new_sample_number = 0

    for r1_file, r2_file in zip(r1, r2):
        # Extract sample name
        sample = extract_sample(r1_file, r2_file)
        args.sample = sample
        if sample in sample_list_F:

            sample_number = str(sample_list_F.index(sample) + 1)
            sample_total = str(len(sample_list_F))

            out_markdup_trimmed_name = sample + ".rg.markdup.trimmed.sorted.bam"
            output_markdup_trimmed_file = os.path.join(
                out_map_dir, out_markdup_trimmed_name)

            if sample in new_samples:
                new_sample_number = str(int(new_sample_number) + 1)
                new_sample_total = str(len(new_samples))
                logger.info("\n" + WHITE_BG + "STARTING SAMPLE: " + sample +
                            " (" + sample_number + "/" + sample_total + ")" +
                            " (" + new_sample_number + "/" + new_sample_total +
                            ")" + END_FORMATTING)
            else:
                logger.info("\n" + WHITE_BG + "STARTING SAMPLE: " + sample +
                            " (" + sample_number + "/" + sample_total + ")" +
                            END_FORMATTING)

            if not os.path.isfile(output_markdup_trimmed_file):

                args.r1_file = r1_file
                args.r2_file = r2_file

                ##############START PIPELINE#####################
                #################################################

                # INPUT ARGUMENTS
                ################
                check_file_exists(r1_file)
                check_file_exists(r2_file)

                args.output = os.path.abspath(args.output)
                check_create_dir(args.output)

                # QUALITY CHECK in RAW with fastqc
                ######################################################
                check_create_dir(out_qc_dir)

                out_qc_raw_name_r1 = (".").join(
                    r1_file.split('/')[-1].split('.')[0:-2]) + '_fastqc.html'
                out_qc_raw_name_r2 = (".").join(
                    r2_file.split('/')[-1].split('.')[0:-2]) + '_fastqc.html'
                output_qc_raw_file_r1 = os.path.join(out_qc_pre_dir,
                                                     out_qc_raw_name_r1)
                output_qc_raw_file_r2 = os.path.join(out_qc_pre_dir,
                                                     out_qc_raw_name_r2)

                if os.path.isfile(output_qc_raw_file_r1) and os.path.isfile(
                        output_qc_raw_file_r2):
                    logger.info(YELLOW + DIM + output_qc_raw_file_r1 +
                                " EXISTS\nOmitting QC for sample " + sample +
                                END_FORMATTING)
                else:
                    logger.info(GREEN + "Checking quality in sample " +
                                sample + END_FORMATTING)
                    logger.info("R1: " + r1_file + "\nR2: " + r2_file)
                    fastqc_quality(r1_file, r2_file, out_qc_pre_dir,
                                   args.threads)
                """
                TODO: Human filter
                """

                # QUALITY TRIMMING AND ADAPTER REMOVAL WITH fastp
                ###################################################
                out_trim_name_r1 = sample + ".trimmed_R1.fastq.gz"
                out_trim_name_r2 = sample + ".trimmed_R2.fastq.gz"
                output_trimming_file_r1 = os.path.join(out_trim_dir,
                                                       out_trim_name_r1)
                output_trimming_file_r2 = os.path.join(out_trim_dir,
                                                       out_trim_name_r2)

                if os.path.isfile(output_trimming_file_r1) and os.path.isfile(
                        output_trimming_file_r2):
                    logger.info(YELLOW + DIM + output_trimming_file_r1 +
                                " EXISTS\nOmitting Trimming for sample " +
                                sample + END_FORMATTING)
                else:
                    logger.info(GREEN + "Trimming sample " + sample +
                                END_FORMATTING)
                    fastp_trimming(r1_file,
                                   r2_file,
                                   sample,
                                   out_trim_dir,
                                   threads=args.threads,
                                   min_qual=20,
                                   window_size=10,
                                   min_len=35)

                # QUALITY CHECK in TRIMMED with fastqc
                ######################################################
                check_create_dir(out_qc_dir)

                out_qc_pos_r1 = sample + ".trimmed_R1_fastqc.html"
                out_qc_pos_r2 = sample + ".trimmed_R2_fastqc.html"
                output_qc_precessed_file_r1 = os.path.join(
                    out_qc_post_dir, out_qc_pos_r1)
                output_qc_precessed_file_r2 = os.path.join(
                    out_qc_post_dir, out_qc_pos_r2)

                if os.path.isfile(
                        output_qc_precessed_file_r1) and os.path.isfile(
                            output_qc_precessed_file_r2):
                    logger.info(YELLOW + DIM + output_qc_precessed_file_r1 +
                                " EXISTS\nOmitting QC for sample " + sample +
                                END_FORMATTING)
                else:
                    logger.info(GREEN +
                                "Checking quality in processed sample " +
                                sample + END_FORMATTING)
                    logger.info("R1: " + output_trimming_file_r1 + "\nR2: " +
                                output_trimming_file_r2)
                    fastqc_quality(output_trimming_file_r1,
                                   output_trimming_file_r2, out_qc_post_dir,
                                   args.threads)

                # MAPPING WITH BWA - SAM TO SORTED BAM - ADD HEADER SG
                #####################################################
                out_map_name = sample + ".rg.sorted.bam"
                output_map_file = os.path.join(out_map_dir, out_map_name)

                if os.path.isfile(output_map_file):
                    logger.info(YELLOW + DIM + output_map_file +
                                " EXISTS\nOmitting Mapping for sample " +
                                sample + END_FORMATTING)
                else:
                    logger.info(GREEN + "Mapping sample " + sample +
                                END_FORMATTING)
                    logger.info("R1: " + output_trimming_file_r1 + "\nR2: " +
                                output_trimming_file_r2 + "\nReference: " +
                                reference)
                    bwa_mapping(output_trimming_file_r1,
                                output_trimming_file_r2,
                                reference,
                                sample,
                                out_map_dir,
                                threads=args.threads)
                    sam_to_index_bam(sample,
                                     out_map_dir,
                                     output_trimming_file_r1,
                                     threads=args.threads)

                #MARK DUPLICATES WITH PICARDTOOLS ###################
                #####################################################
                out_markdup_name = sample + ".rg.markdup.sorted.bam"
                output_markdup_file = os.path.join(out_map_dir,
                                                   out_markdup_name)

                if os.path.isfile(output_markdup_file):
                    logger.info(YELLOW + DIM + output_markdup_file +
                                " EXISTS\nOmitting Duplicate Marking for sample " +
                                sample + END_FORMATTING)
                else:
                    logger.info(GREEN + "Marking Dupes in sample " + sample +
                                END_FORMATTING)
                    logger.info("Input Bam: " + output_map_file)
                    picard_markdup(output_map_file)

                #TRIM PRIMERS WITH ivar trim ########################
                #####################################################

                if os.path.isfile(output_markdup_trimmed_file):
                    logger.info(YELLOW + DIM + output_markdup_trimmed_file +
                                " EXISTS\nOmitting Primer Trimming for sample " +
                                sample + END_FORMATTING)
                else:
                    logger.info(GREEN + "Trimming primers in sample " +
                                sample + END_FORMATTING)
                    logger.info("Input Bam: " + output_markdup_file)
                    ivar_trim(output_markdup_file,
                              args.primers,
                              sample,
                              min_length=30,
                              min_quality=20,
                              sliding_window_width=4)
            else:
                logger.info(
                    YELLOW + DIM + output_markdup_trimmed_file +
                    " EXISTS\nOmitting BAM mapping and BAM manipulation in sample "
                    + sample + END_FORMATTING)

            ########################END OF MAPPING AND BAM MANIPULATION#####################################################################
            ################################################################################################################################

            #VARIANT CALLING WITH ivar variants##################
            #####################################################
            check_create_dir(out_variant_dir)
            out_ivar_variant_name = sample + ".tsv"
            out_ivar_variant_file = os.path.join(out_variant_ivar_dir,
                                                 out_ivar_variant_name)

            if os.path.isfile(out_ivar_variant_file):
                logger.info(YELLOW + DIM + out_ivar_variant_file +
                            " EXISTS\nOmitting Variant calling for sample " +
                            sample + END_FORMATTING)
            else:
                logger.info(GREEN + "Calling variants with ivar in sample " +
                            sample + END_FORMATTING)
                ivar_variants(reference,
                              output_markdup_trimmed_file,
                              out_variant_dir,
                              sample,
                              annotation,
                              min_quality=15,
                              min_frequency_threshold=0.01,
                              min_depth=1)

            #VARIANT FILTERING ##################################
            #####################################################
            check_create_dir(out_filtered_ivar_dir)
            out_ivar_filtered_file = os.path.join(out_filtered_ivar_dir,
                                                  out_ivar_variant_name)

            if os.path.isfile(out_ivar_filtered_file):
                logger.info(YELLOW + DIM + out_ivar_filtered_file +
                            " EXISTS\nOmitting Variant filtering for sample " +
                            sample + END_FORMATTING)
            else:
                logger.info(GREEN + "Filtering variants in sample " + sample +
                            END_FORMATTING)
                filter_tsv_variants(out_ivar_variant_file,
                                    out_filtered_ivar_dir,
                                    min_frequency=0.7,
                                    min_total_depth=10,
                                    min_alt_dp=4,
                                    is_pass=True,
                                    only_snp=False)

            #CREATE CONSENSUS with ivar consensus##################
            #######################################################
            check_create_dir(out_consensus_dir)
            check_create_dir(out_consensus_ivar_dir)
            out_ivar_consensus_name = sample + ".fa"
            out_ivar_consensus_file = os.path.join(out_consensus_ivar_dir,
                                                   out_ivar_consensus_name)

            if os.path.isfile(out_ivar_consensus_file):
                logger.info(YELLOW + DIM + out_ivar_consensus_file +
                            " EXISTS\nOmitting Consensus for sample " +
                            sample + END_FORMATTING)
            else:
                logger.info(GREEN + "Creating consensus with ivar in sample " +
                            sample + END_FORMATTING)
                ivar_consensus(output_markdup_trimmed_file,
                               out_consensus_ivar_dir,
                               sample,
                               min_quality=20,
                               min_frequency_threshold=0.8,
                               min_depth=20,
                               uncovered_character='N')
                logger.info(GREEN + "Replacing consensus header in " + sample +
                            END_FORMATTING)
                replace_consensus_header(out_ivar_consensus_file)

            ########################CREATE STATS AND QUALITY FILTERS########################################################################
            ################################################################################################################################
            #CREATE Bamstats#######################################
            #######################################################
            check_create_dir(out_stats_dir)
            check_create_dir(out_stats_bamstats_dir)
            out_bamstats_name = sample + ".bamstats"
            out_bamstats_file = os.path.join(out_stats_bamstats_dir,
                                             out_bamstats_name)

            if os.path.isfile(out_bamstats_file):
                logger.info(YELLOW + DIM + out_bamstats_file +
                            " EXISTS\nOmitting Bamstats for sample " + sample +
                            END_FORMATTING)
            else:
                logger.info(GREEN + "Creating bamstats in sample " + sample +
                            END_FORMATTING)
                create_bamstat(output_markdup_trimmed_file,
                               out_stats_bamstats_dir,
                               sample,
                               threads=args.threads)

            #CREATE Coverage#######################################
            #######################################################
            check_create_dir(out_stats_coverage_dir)
            out_coverage_name = sample + ".cov"
            out_coverage_file = os.path.join(out_stats_coverage_dir,
                                             out_coverage_name)

            if os.path.isfile(out_coverage_file):
                logger.info(YELLOW + DIM + out_coverage_file +
                            " EXISTS\nOmitting Coverage for sample " + sample +
                            END_FORMATTING)
            else:
                logger.info(GREEN + "Creating coverage in sample " + sample +
                            END_FORMATTING)
                create_coverage(output_markdup_trimmed_file,
                                out_stats_coverage_dir, sample)

    # fastqc OUTPUT FORMAT FOR COMPARISON
    ######################################################
    logger.info(GREEN + "Creating summary report for quality result " +
                END_FORMATTING)
    # format_html_image(out_qc_dir)

    # coverage OUTPUT SUMMARY
    ######################################################
    logger.info(GREEN + "Creating summary report for coverage result " +
                END_FORMATTING)
    obtain_group_cov_stats(out_stats_coverage_dir, group_name)

    # READS and VARIANTS OUTPUT SUMMARY
    ######################################################
    logger.info(GREEN + "Creating overal summary report " + END_FORMATTING)
    obtain_overal_stats(output, group_name)

    # REMOVE UNCOVERED
    ##############################################################################################################################
    logger.info(GREEN + "Removing low quality samples" + END_FORMATTING)
    # remove_low_quality(output, min_percentage_20x=args.coverage20,
    #                   min_hq_snp=args.min_snp, type_remove='Uncovered')

    #ANNOTATION WITH SNPEFF, USER INPUT AND PANGOLIN ####
    #####################################################
    logger.info("\n\n" + BLUE + BOLD + "STARTING ANNOTATION IN GROUP: " +
                group_name + END_FORMATTING + "\n")
    check_create_dir(out_annot_dir)
    check_create_dir(out_annot_snpeff_dir)
    check_create_dir(out_annot_pangolin_dir)
    # SNPEFF
    if args.snpeff_database != False:
        # CHANGE FOR RAW/FILTERED ANNOTATION
        for root, _, files in os.walk(out_filtered_ivar_dir):
            if root == out_filtered_ivar_dir:  # CHANGE FOR RAW/FILTERED ANNOTATION
                for name in files:
                    if name.endswith('.tsv'):
                        sample = name.split('.')[0]
                        filename = os.path.join(root, name)
                        out_annot_file = os.path.join(out_annot_snpeff_dir,
                                                      sample + ".annot")
                        if os.path.isfile(out_annot_file):
                            logger.info(
                                YELLOW + DIM + out_annot_file +
                                " EXISTS\nOmitting snpEff Annotation for sample "
                                + sample + END_FORMATTING)
                        else:
                            logger.info(GREEN +
                                        "Annotating sample with snpEff: " +
                                        sample + END_FORMATTING)
                            output_vcf = os.path.join(out_annot_snpeff_dir,
                                                      sample + '.vcf')
                            annotate_snpeff(filename,
                                            output_vcf,
                                            out_annot_file,
                                            database=args.snpeff_database)
    # USER DEFINED
    if not args.annot_bed and not args.annot_vcf:
        logger.info(YELLOW + BOLD +
                    "Omitting User Annotation, no BED or VCF files supplied" +
                    END_FORMATTING)
    else:
        check_create_dir(out_annot_user_dir)
        # CHANGE FOR RAW/FILTERED ANNOTATION
        for root, _, files in os.walk(out_variant_ivar_dir):
            if root == out_variant_ivar_dir:  # CHANGE FOR RAW/FILTERED ANNOTATION
                for name in files:
                    if name.endswith('.tsv'):
                        sample = name.split('.')[0]
                        logger.info(
                            'User bed/vcf annotation in sample {}'.format(
                                sample))
                        filename = os.path.join(root, name)
                        out_annot_file = os.path.join(out_annot_user_dir,
                                                      sample + ".tsv")
                        user_annotation(filename,
                                        out_annot_file,
                                        vcf_files=args.annot_vcf,
                                        bed_files=args.annot_bed)

    # USER AA DEFINED
    if not args.annot_aa:
        logger.info(YELLOW + BOLD +
                    "Omitting User aa Annotation, no AA files supplied" +
                    END_FORMATTING)
    else:
        check_create_dir(out_annot_user_aa_dir)
        for root, _, files in os.walk(out_annot_snpeff_dir):
            if root == out_annot_snpeff_dir:
                for name in files:
                    if name.endswith('.annot'):
                        sample = name.split('.')[0]
                        logger.info(
                            'User aa annotation in sample {}'.format(sample))
                        filename = os.path.join(root, name)
                        out_annot_aa_file = os.path.join(
                            out_annot_user_aa_dir, sample + ".tsv")
                        if os.path.isfile(out_annot_aa_file):
                            user_annotation_aa(out_annot_aa_file,
                                               out_annot_aa_file,
                                               aa_files=args.annot_aa)
                        else:
                            user_annotation_aa(filename,
                                               out_annot_aa_file,
                                               aa_files=args.annot_aa)

    # PANGOLIN
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=args.threads) as executor:
        futures_pangolin = []

        for root, _, files in os.walk(out_consensus_ivar_dir):
            if root == out_consensus_ivar_dir:
                for name in files:
                    if name.endswith('.fa'):
                        sample = name.split('.')[0]
                        filename = os.path.join(root, name)
                        out_pangolin_filename = sample + ".lineage.csv"
                        out_pangolin_file = os.path.join(
                            out_annot_pangolin_dir, out_pangolin_filename)
                        if os.path.isfile(out_pangolin_file):
                            logger.info(
                                YELLOW + DIM + out_pangolin_file +
                                " EXISTS\nOmitting Lineage for sample " +
                                sample + END_FORMATTING)
                        else:
                            logger.info(GREEN +
                                        "Obtaining Lineage in sample " +
                                        sample + END_FORMATTING)
                            future = executor.submit(annotate_pangolin,
                                                     filename,
                                                     out_annot_pangolin_dir,
                                                     out_pangolin_filename,
                                                     threads=args.threads,
                                                     max_ambig=0.6)
                            futures_pangolin.append(future)
                for future in concurrent.futures.as_completed(
                        futures_pangolin):
                    logger.info(future.result())
                    # annotate_pangolin(filename, out_annot_pangolin_dir,
                    #                out_pangolin_filename, threads=args.threads, max_ambig=0.6)

    # USER AA TO HTML
    annotated_samples = []
    logger.info('Adapting annotation to html in {}'.format(group_name))
    for root, _, files in os.walk(out_annot_user_aa_dir):
        if root == out_annot_user_aa_dir:
            for name in files:
                if name.endswith('.tsv'):
                    sample = name.split('.')[0]
                    annotated_samples.append(sample)
                    filename = os.path.join(root, name)
                    annotation_to_html(filename, sample)
    annotated_samples = [str(x) for x in annotated_samples]
    report_samples_html_all = report_samples_html.replace(
        'ALLSAMPLES', ('","').join(annotated_samples))  # NEW
    with open(os.path.join(out_annot_user_aa_dir, '00_all_samples.html'),
              'w+') as f:
        f.write(report_samples_html_all)

    # SNP COMPARISON using tsv variant files
    ######################################################
    logger.info("\n\n" + BLUE + BOLD + "STARTING COMPARISON IN GROUP: " +
                group_name + END_FORMATTING + "\n")

    check_create_dir(out_compare_dir)
    folder_compare = today + "_" + group_name
    path_compare = os.path.join(out_compare_dir, folder_compare)
    check_create_dir(path_compare)
    full_path_compare = os.path.join(path_compare, group_name)

    # ddtb_add(out_filtered_ivar_dir, full_path_compare)
    compare_snp_matrix_recal = full_path_compare + ".revised.final.tsv"
    compare_snp_matrix_INDEL = full_path_compare + ".revised_INDEL.final.tsv"
    compare_snp_matrix_recal_intermediate = full_path_compare + ".revised_intermediate.tsv"
    compare_snp_matrix_INDEL_intermediate = full_path_compare + \
        ".revised_INDEL_intermediate.tsv"
    recalibrated_snp_matrix_intermediate = ddbb_create_intermediate(
        out_variant_ivar_dir,
        out_stats_coverage_dir,
        min_freq_discard=0.1,
        min_alt_dp=4,
        only_snp=args.only_snp)
    recalibrated_snp_matrix_intermediate.to_csv(
        compare_snp_matrix_recal_intermediate, sep="\t", index=False)
    compare_snp_matrix_INDEL_intermediate_df = remove_position_range(
        recalibrated_snp_matrix_intermediate)
    compare_snp_matrix_INDEL_intermediate_df.to_csv(
        compare_snp_matrix_INDEL_intermediate, sep="\t", index=False)
    recalibrated_revised_df = revised_df(recalibrated_snp_matrix_intermediate,
                                         path_compare,
                                         min_freq_include=0.7,
                                         min_threshold_discard_sample=0.07,
                                         min_threshold_discard_position=0.4,
                                         remove_faulty=True,
                                         drop_samples=True,
                                         drop_positions=True)
    recalibrated_revised_df.to_csv(compare_snp_matrix_recal,
                                   sep="\t",
                                   index=False)
    recalibrated_revised_INDEL_df = revised_df(
        compare_snp_matrix_INDEL_intermediate_df,
        path_compare,
        min_freq_include=0.7,
        min_threshold_discard_sample=0.07,
        min_threshold_discard_position=0.4,
        remove_faulty=True,
        drop_samples=True,
        drop_positions=True)
    recalibrated_revised_INDEL_df.to_csv(compare_snp_matrix_INDEL,
                                         sep="\t",
                                         index=False)

    ddtb_compare(compare_snp_matrix_recal, distance=0)
    ddtb_compare(compare_snp_matrix_INDEL, distance=0, indel=True)

    logger.info("\n\n" + MAGENTA + BOLD + "COMPARING FINISHED IN GROUP: " +
                group_name + END_FORMATTING + "\n")

    #####################CONSENSUS WITH REFINED CALL######
    ######################################################
    logger.info(GREEN + "Creating refined consensus" + END_FORMATTING)
    create_consensus(reference, compare_snp_matrix_recal,
                     out_stats_coverage_dir, out_consensus_dir)

    logger.info("\n\n" + MAGENTA + BOLD +
                "#####END OF PIPELINE COVID MULTI ANALYSIS#####" +
                END_FORMATTING + "\n")
Example #9
0
def ddtb_add(args):
    directory = os.path.abspath(args.folder)
    output_file = os.path.abspath(args.output_file)

    #Select NEW vs UPDATE
    if args.subtask == 'new' :
        final_ddbb = blank_database()
    elif args.subtask == 'update':
        update_database = os.path.abspath(args.update_database)
        if update_database == output_file:
            print(RED + "ERROR: " + END_FORMATTING + BOLD + "Pick a diferent name for the output database" + END_FORMATTING)
            sys.exit(1)
        else:
            final_ddbb = import_to_pandas(update_database, header=True)
    #Make sure the output does not already exist, to force choosing a new name
    if os.path.isfile(output_file):
        print(YELLOW + "ERROR: " + BOLD + "output database EXIST, choose a different name or manually delete" + END_FORMATTING)
        sys.exit(1)

    
    print("Previous final database contains %s rows and %s columns\n" % final_ddbb.shape)
    print("The directory selected is: %s" % directory)
    

    all_samples = 0
    new_samples = 0
    for filename in os.listdir(directory):
        if not filename.startswith('.') and filename.endswith(args.suffix):
            print("\nThe file is: %s" % filename)
            
            all_samples = all_samples + 1
            positions_shared = []
            positions_added = []
            
            sample = filename.split(".")[0] #Manage sample name
            
            file = os.path.join(directory, filename) #Whole file path
            check_file_exists(file) #Manage file[s]. Check if file exist and is greater than 0

            new_sample = import_VCF4_to_pandas(file) #Import files in annotated vcf format

            #Handle each new_sample
            #print("This file contains %s SNPs" % len(new_sample.index))
            
            #Check if sample exist
            ######################
            if sample not in final_ddbb.columns.tolist():
                print("Adding new sample %s to %s" % (sample, os.path.basename(args.output_file)))
                new_samples = new_samples + 1
                new_colum_index = len(final_ddbb.columns) #extract the number of columns to insert a new one
                #final_ddbb[sample] = sample #adds a new column but fills all blanks with the value sample
                final_ddbb.insert(new_colum_index, sample, 0) #add a new column with default value 0
                
                #Check if position exist
                ########################
                for position in new_sample['POS'].unique(): #extract first column in file
                    
                    if position not in final_ddbb["Position"].values:
                        positions_added.append(position) #Count new positions for stats
                        
                        new_row = len(final_ddbb.index)
                        final_ddbb.loc[new_row,'Position'] = position
                        final_ddbb.loc[new_row,'Samples'] = sample
                        final_ddbb.loc[new_row,'N'] = int(1)
                        final_ddbb.loc[new_row,sample] = str(1)
                        
                    else:
                        positions_shared.append(position) #Count shared positions for stats
                        
                        #Check whether the column matches the value and retrieve the first position [0]
                        #of the object index generated
                        index_position = final_ddbb.index[final_ddbb["Position"] == position][0]
                        #Add sample to corresponding cell [position, samples]
                        number_samples_with_position = final_ddbb.loc[index_position,'N']
                        names_samples_with_position = final_ddbb.loc[index_position,'Samples']
                        new_names_samples = names_samples_with_position + "," + sample
                        #Add 1 to the number of samples containing the position
                        final_ddbb.loc[index_position,'N'] = number_samples_with_position + 1
                        final_ddbb.loc[index_position,'Samples'] = new_names_samples
                        final_ddbb.loc[index_position,sample] = str(1) #Add "1" in cell with correct position vs sample (indicate present)

                print("\nSAMPLE:\t%s\nTOTAL Variants:\t%s\nShared Variants:\t%s\nNew Variants:\t%s\n"
                % (sample, len(new_sample.index), len(positions_shared), len(positions_added)))
            else:
                print(YELLOW + "The sample " + sample + " ALREADY exist" + END_FORMATTING)


    final_ddbb = final_ddbb.fillna(0).sort_values("Position") #final_ddbb = final_ddbb["Position"].astype(int)
    final_ddbb['N'] = final_ddbb['N'].astype(int)
    final_ddbb = final_ddbb.reset_index(drop=True)

    print("Final database now contains %s rows and %s columns" % final_ddbb.shape)
    if args.recalibrate == False:
        final_ddbb.to_csv(output_file, sep='\t', index=False)
    else:
        args.recalibrate = os.path.abspath(args.recalibrate)
        if os.path.exists(args.recalibrate):
            recalibrate_params = extract_recalibrate_params(args.recalibrate)
            print("\n" + MAGENTA + "Recalibration selected" + END_FORMATTING)
            output_file = (".").join(output_file.split(".")[:-1]) + ".revised.tsv"
            final_ddbb_revised = recalibrate_ddbb_vcf(final_ddbb, recalibrate_params[0], recalibrate_params[1], recalibrate_params[2])
            final_ddbb_revised.to_csv(output_file, sep='\t', index=False)
        else:
            print("The directory supplied for recalculation does not exixt")
            sys.exit(1)

    #Create small report with basic count
    #####################################
            
    print("\n" + GREEN + "Position check Finished" + END_FORMATTING)
    print(GREEN + "Added " + str(new_samples) + " samples out of " + str(all_samples) + END_FORMATTING + "\n")
    
    #pd.set_option('display.precision', 0)
    #pd.reset_option('^display.', silent=True) #Reset options in case I mess up
Example #10
0
        output_bqsr_file = os.path.join(out_map_dir, out_bqsr_name)

        if not os.path.isfile(output_bqsr_file):

            args.r1_file = r1_file
            args.r2_file = r2_file

            print("\n" + WHITE_BG + "STARTING SAMPLE: " + sample + " (" +
                  sample_number + "/" + sample_total + ")" + END_FORMATTING)

            ##############START PIPELINE#####################
            #################################################

            #INPUT ARGUMENTS
            ################
            check_file_exists(args.r1_file)
            check_file_exists(args.r2_file)

            args.output = os.path.abspath(args.output)
            check_create_dir(args.output)
            #QUALITY CHECK
            ##############
            """
            TODO: Quality check 
            TODO: Human filter
            """

            #QUALITY TRIMMING AND ADAPTER REMOVAL WITH bbduk.sh
            ###################################################
            out_trim_name_r1 = sample + "_R1.clean.fastq.gz"
            out_trim_name_r2 = sample + "_R2.clean.fastq.gz"