def local(geno_name, harmonizer_path, vcf_path, legend_path, fasta_path):
    # Harmonizes genotypes using 1000 Genomes as a reference (based on the Perl script by W. Rayner, 2015,
    # wrayner@well.ox.ac.uk):
    #   -Removes SNPs with MAF < 5% in the study dataset
    #   -Removes SNPs not in 1000 Genomes Phase 3
    #   -Removes all A/T and G/C SNPs with MAF > 40% in the reference dataset
    #   -Removes all SNPs with an allele frequency (AF) difference > 0.2 between the reference and the dataset
    #        frequency file; the frequency file is expected to be a plink frequency file with the same number of
    #        SNPs as the bim file
    #   -Removes duplicates that may be introduced by the position update
    #   -Removes indels (TODO: decide how, or whether this is even necessary with plink files)
    #   -Removes SNPs with HWE p-value < 0.01
    #   -Updates the reference allele to match 1000G
    #   -Outputs new files per chromosome, in plink bed/bim/fam format
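    #
    # A hypothetical invocation, purely for illustration (these paths are not from this repo):
    #   local('mystudy', '/opt/GenotypeHarmonizer', '/data/1000G_VCF', '/data/1000GP_Phase3', '/data/fasta')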

    # Needed modules
    import os
    import sys
    import csv
    import gzip
    import shutil
    import subprocess

    try:
        import pandas as pd
    except (ImportError, ModuleNotFoundError):
        import genodownload
        genodownload.getpandas()
        import pandas as pd

    try:
        import numpy as np
    except (ImportError, ModuleNotFoundError):
        import genodownload
        genodownload.getnumpy()
        import numpy as np

    # Get current working directory.
    orig_wd = os.getcwd()

    # Make new folder where the harmonized files will be located.
    if not os.path.exists('Harmonized_To_1000G'):
        os.makedirs('Harmonized_To_1000G')

    # Copy genotype files to new folder.
    shutil.copy2(geno_name + '.bed', 'Harmonized_To_1000G')
    shutil.copy2(geno_name + '.bim', 'Harmonized_To_1000G')
    shutil.copy2(geno_name + '.fam', 'Harmonized_To_1000G')

    # Switch to this directory.
    os.chdir('Harmonized_To_1000G')

    # Make the lists that we're going to need, since this is on a per chromosome basis.
    vcf_file_names = ['ALL.chr%d.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz' % x for x in
                      range(1, 23)]
    vcf_file_names.append('ALL.chrX.phase3_shapeit2_mvncall_integrated_v1b.20130502.genotypes.vcf.gz')
    legend_file_names = ['1000GP_Phase3_chr%d.legend.gz' % x for x in range(1, 23)]
    legend_file_names.append('1000GP_Phase3_chrX_NONPAR.legend.gz')
    harmonized_geno_names = [geno_name + '_chr%d_Harmonized' % x for x in range(1, 24)]
    # Report filenames written by the harmonizer; the *_names lists keep the filenames, while id_updates and
    # snp_logs are overwritten with dataframes as each report is read in.
    id_updates = [s + '_idUpdates.txt' for s in harmonized_geno_names]
    id_update_names = [s + '_idUpdates.txt' for s in harmonized_geno_names]
    snp_logs = [s + '_snpLog.log' for s in harmonized_geno_names]
    snp_log_names = [s + '_snpLog.log' for s in harmonized_geno_names]
    freq_file_names = [geno_name + '_chr%d_Harmonized.frq' % x for x in range(1, 24)]
    af_diff_removed_by_chr = ['chr%d_SNPsRemoved_AFDiff' % x for x in range(1, 24)]
    final_snps_by_chr = ['chr%d_SNPsKept' % x for x in range(1, 24)]
    final_snp_lists = ['chr%d_SNPsKept.txt' % x for x in range(1, 24)]
    af_checked_names = [geno_name + '_chr%d_HarmonizedTo1000G' % x for x in range(1, 24)]
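    # Note on indexing: list index i corresponds to chromosome i + 1, so index 22 is the X chromosome (coded 23 in
    # plink, 'X' in the 1000G reference files). af_diff_removed_by_chr and final_snps_by_chr start as name stubs and
    # are overwritten with per-chromosome dataframes in the AF-check loop below.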

    # Harmonize for each chromosome
    for i in range(0, len(vcf_file_names)):
        # Call genotype harmonizer for autosomes
        if i < 22:
            # Remove SNPs with HWE p-value < 0.01 and SNPs with MAF < 0.05
            subprocess.check_output([plink, '--bfile', geno_name, '--chr', str(i+1), '--hardy', '--hwe', '0.01',
                                     '--maf', '0.05', '--make-bed', '--out',
                                     geno_name + '_MAF_HWE_Filter_chr' + str(i+1)])
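            # For illustration, with a hypothetical geno_name of 'mystudy' and i = 0, the assembled shell command is
            # roughly:
            #   java -Xmx1g -jar "<harmonizer_path>/GenotypeHarmonizer.jar" --input mystudy_MAF_HWE_Filter_chr1 \
            #       --ref "<vcf_path>/ALL.chr1.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz" \
            #       --refType VCF --update-id --debug --mafAlign 0 --update-reference-allele --outputType PLINK_BED \
            #       --output mystudy_chr1_Harmonized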
            subprocess.check_output('java -Xmx1g -jar "' + harmonizer_path + '/GenotypeHarmonizer.jar" --input '
                                    + geno_name + '_MAF_HWE_Filter_chr' + str(i+1) + ' --ref "'
                                    + os.path.join(vcf_path, vcf_file_names[i])
                                    + '" --refType VCF --update-id --debug --mafAlign 0 --update-reference-allele '
                                      '--outputType PLINK_BED --output ' + harmonized_geno_names[i], shell=True)
            subprocess.call(rm + geno_name + '_MAF_HWE_Filter_chr' + str(i + 1) + '.*', shell=True)
            id_updates[i] = pd.read_csv(id_update_names[i], sep='\t', header=0,
                                        dtype={'chr': str, 'pos': int, 'originalId': str, 'newId': str})
            snp_logs[i] = pd.read_csv(snp_log_names[i], sep='\t', header=0,
                                      dtype={'chr': str, 'pos': int, 'id': str, 'alleles': str, 'action': str,
                                             'message': str})

        elif i == 22:
            # Special handling for chrX
            # Make a list of females (plink --keep needs both the FID and IID columns)
            fam_file = pd.read_csv(geno_name + '.fam', sep=' ', header=None)
            females = fam_file.loc[fam_file[4] == 2]
            females[[0, 1]].to_csv(geno_name + '_Females.txt', sep='\t', header=False, index=False)
            # Make hwe statistics using just females
            subprocess.check_output([plink, '--bfile', geno_name, '--chr', 'X', '--hardy', '--keep',
                                     geno_name + '_Females.txt', '--out', geno_name + '_chr23'])
            # Get the list of SNPs with HWE p-values <= 0.01 (the .hwe output is whitespace-aligned, not tab-delimited)
            hwe = pd.read_csv(geno_name + '_chr23.hwe', sep=r'\s+', header=None, skiprows=1)
            hweremove = hwe.loc[hwe[8] <= 0.01]
            hweremove[1].to_csv(geno_name + '_chr23_RemHWE.txt', sep='\t', header=False, index=False)
            # Remove these from plink file
            subprocess.check_output([plink, '--bfile', geno_name, '--chr', 'X', '--maf', '0.05', '--exclude',
                                     geno_name + '_chr23_RemHWE.txt', '--make-bed', '--out',
                                     geno_name + '_MAF_HWE_Filter_chr23'])
            # Read the chr23 bim file into pandas
            bim_file = pd.read_csv(geno_name + '_MAF_HWE_Filter_chr23.bim', sep='\t', header=None)
            # Replace '23' with 'X', which is how genotype harmonizer refers to the X chromosome. Assign back to the
            # column, since replace(inplace=True) on an iloc slice does not reliably propagate.
            bim_file[0] = bim_file[0].replace(23, 'X')
            # Write the updated bim file
            bim_file.to_csv(geno_name + '_MAF_HWE_Filter_chr23.bim', sep='\t', header=False, index=False, na_rep='NA')
            # Call genotype harmonizer for the X chromosome
            subprocess.check_output('java -Xmx1g -jar "' + harmonizer_path + '/GenotypeHarmonizer.jar" --input '
                                    + geno_name + '_MAF_HWE_Filter_chr23 --ref "'
                                    + os.path.join(vcf_path, vcf_file_names[i])
                                    + '" --refType VCF --update-id --debug --mafAlign 0 --update-reference-allele '
                                      '--outputType PLINK_BED --output ' + harmonized_geno_names[i], shell=True)
            subprocess.call(rm + geno_name + '_MAF_HWE_Filter_chr23.*', shell=True)
            # Use the '.*' glob so the freshly written <geno_name>_chr23_Harmonized* files are not swept up too.
            subprocess.call(rm + geno_name + '_chr23.*', shell=True)
            subprocess.call(rm + geno_name + '_chr23_RemHWE.txt', shell=True)

            id_updates[i] = pd.read_csv(id_update_names[i], sep='\t', header=0,
                                        dtype={'chr': str, 'pos': int, 'originalId': str, 'newId': str})
            snp_logs[i] = pd.read_csv(snp_log_names[i], sep='\t', header=0,
                                      dtype={'chr': str, 'pos': int, 'id': str, 'alleles': str, 'action': str,
                                             'message': str})
        else:
            print(Fore.RED + Style.BRIGHT)
            sys.exit("Something is wrong with the number/name of reference files")

    # Concatenate all of the per-chromosome id updates into one dataframe.
    all_id_updates = pd.concat(id_updates)
    # Write the combined list to a text file.
    all_id_updates.to_csv('Harmonization_ID_Updates.txt', sep='\t', header=True, index=False)

    # Remove the clutter
    if os.path.getsize('Harmonization_ID_Updates.txt') > 0:
        for f in id_update_names:
            os.remove(f)

    # Concatenate all of the per-chromosome snp logs into one dataframe.
    all_snp_logs = pd.concat(snp_logs)
    # Write the combined log to a text file.
    all_snp_logs.to_csv('Harmonization_SNP_Logs.txt', sep='\t', header=True, index=False)

    # Remove the clutter
    if os.path.getsize('Harmonization_SNP_Logs.txt') > 0:
        for f in snp_log_names:
            os.remove(f)

    # Now, for each chromosome that was just harmonized, remove all SNPs with an allele frequency (AF) difference > 0.2
    # between the study dataset and every 1000G superpopulation: if the dataset frequency is within 0.2 of any
    # superpopulation frequency, keep the variant. The frequency file is expected to be a plink frequency file with the
    # same number of variants as the bim file.
    for i in range(0, len(harmonized_geno_names)):
        # Create plink frequency file
        subprocess.check_output([plink, '--bfile', harmonized_geno_names[i], '--freq', '--out',
                                 harmonized_geno_names[i]])
        # Read freq file into python (the .frq output is whitespace-aligned, not tab-delimited)
        freq_file = pd.read_csv(freq_file_names[i], sep=r'\s+', header=0, usecols=[0, 1, 2, 3, 4],
                                dtype={'CHR': int, 'SNP': str, 'A1': str, 'A2': str, 'MAF': float})
        # Rename columns of freq file.
        freq_file.rename(columns={'A1': 'dataset_a1', 'A2': 'dataset_a2', 'MAF': 'dataset_a1_frq'}, inplace=True)
        # Calculate the frequency of the second allele, since it's not given in the freq file.
        freq_file['dataset_a2_frq'] = 1 - freq_file['dataset_a1_frq']
        # Read in bim file.
        bim_file = pd.read_csv(harmonized_geno_names[i] + '.bim', sep=r'\s+', header=None, usecols=[0, 1, 3],
                               names=['CHR', 'SNP', 'position'])
        # Merge frequency file with bim file to get position for each SNP
        freq_file_with_position = pd.merge(left=freq_file, right=bim_file, how='inner', on=['CHR', 'SNP'])
        # Read in legend file.
        legend_file = pd.read_csv(os.path.join(legend_path, legend_file_names[i]), compression="gzip", sep=" ",
                                  header=0, dtype={'id': str, 'position': int, 'a0': str, 'a1': str, 'TYPE': str,
                                                   'AFR': float, 'AMR': float, 'EAS': float, 'EUR': float, 'SAS': float,
                                                   'ALL': float})
        # Rename columns of legend file
        legend_file.rename(columns={'id': 'reference_id', 'a0': 'reference_a0', 'a1': 'reference_a1'}, inplace=True)

        # To Remove A/T or G/C SNPs in reference file that have an MAF > 40%, first identify which are AT/GC SNPs.
        legend_file['ATGC_SNP'] = np.where(
            ((legend_file['reference_a0'] == 'A') & (legend_file['reference_a1'] == 'T')) |
            ((legend_file['reference_a0'] == 'T') & (legend_file['reference_a1'] == 'A')) |
            ((legend_file['reference_a0'] == 'C') & (legend_file['reference_a1'] == 'G')) |
            ((legend_file['reference_a0'] == 'G') & (legend_file['reference_a1'] == 'C')), 'ATGC', 'Fine')
        # For each superpopulation, create a new column with the MAF. If the AF in that column is less than 0.5, then
        # that is the MAF, if not then 1-AF is MAF
        legend_file['AFR_MAF'] = np.where(legend_file['AFR'] < 0.5, legend_file['AFR'], 1 - legend_file['AFR'])
        legend_file['AMR_MAF'] = np.where(legend_file['AMR'] < 0.5, legend_file['AMR'], 1 - legend_file['AMR'])
        legend_file['EAS_MAF'] = np.where(legend_file['EAS'] < 0.5, legend_file['EAS'], 1 - legend_file['EAS'])
        legend_file['EUR_MAF'] = np.where(legend_file['EUR'] < 0.5, legend_file['EUR'], 1 - legend_file['EUR'])
        legend_file['SAS_MAF'] = np.where(legend_file['SAS'] < 0.5, legend_file['SAS'], 1 - legend_file['SAS'])
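        # e.g. an AFR allele frequency of 0.85 gives AFR_MAF = 1 - 0.85 = 0.15, while 0.30 would stay as 0.30.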

        # Create column in legend file with decision about whether to keep or remove SNP, if it is an ATGC SNP where
        # the MAF in all superpopulations is greater than 40%, then remove it.
        legend_file['MAF_Decision'] = np.where((legend_file['ATGC_SNP'] == 'ATGC') & (legend_file['AFR_MAF'] > 0.4) &
                                               (legend_file['AMR_MAF'] > 0.4) & (legend_file['EAS_MAF'] > 0.4) &
                                               (legend_file['EUR_MAF'] > 0.4) & (legend_file['SAS_MAF'] > 0.4),
                                               'Remove', 'Keep')

        # Make new legend file with just the SNPs that pass this threshold.
        legend_file = legend_file[legend_file['MAF_Decision'] == 'Keep']

        legend_file.drop(labels=['ATGC_SNP', 'AFR_MAF', 'AMR_MAF', 'EAS_MAF', 'EUR_MAF', 'SAS_MAF', 'MAF_Decision'],
                         axis=1, inplace=True)

        # Merge freq file with positions with legend file to get overlap. This file contains only SNPs with matches in
        # 1000G
        merged_file = pd.merge(left=freq_file_with_position, right=legend_file, how='inner', on='position')
        merged_file = merged_file[merged_file['TYPE'] == 'Biallelic_SNP']
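        # Keeping only TYPE == 'Biallelic_SNP' also drops indels and multiallelic sites from the AF comparison,
        # which covers the indel removal mentioned in the header for the overlapping set.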

        # The MAF column in the .frq file is the allele frequency of the A1 allele (which is usually the minor allele,
        # but possibly not here, because the reference allele was just updated to match 1000G).
        # The AF columns in the legend file are the allele frequencies of the a1 allele in that file.
        # Match on the A1 alleles (hopefully already aligned by the harmonization step), then calculate the allele
        # frequency difference for each population group, handling both the non-flipped allele order (same order in
        # the house and reference datasets) and the flipped order.
        # For each superpopulation, compute |dataset AF - reference AF|. Use the dataset A1 frequency when the alleles
        # are in the same order in both files, and the dataset A2 frequency when they are flipped; rows matching
        # neither orientation get NaN and therefore fall through to 'Keep' below.
        same_order = ((merged_file['dataset_a1'] == merged_file['reference_a1']) &
                      (merged_file['dataset_a2'] == merged_file['reference_a0']))
        flipped_order = ((merged_file['dataset_a1'] == merged_file['reference_a0']) &
                         (merged_file['dataset_a2'] == merged_file['reference_a1']))
        for pop in ['AFR', 'AMR', 'EAS', 'EUR', 'SAS']:
            merged_file[pop + '_Diff'] = np.where(
                same_order, abs(merged_file['dataset_a1_frq'] - merged_file[pop]),
                np.where(flipped_order, abs(merged_file['dataset_a2_frq'] - merged_file[pop]), np.nan))

        # Make new column 'AF_Decision' where you remove alleles that have allele frequency differences > 0.2 from all
        # population groups.
        merged_file['AF_Decision'] = np.where((merged_file['AFR_Diff'] > 0.2) & (merged_file['AMR_Diff'] > 0.2) &
                                              (merged_file['EAS_Diff'] > 0.2) & (merged_file['EUR_Diff'] > 0.2) &
                                              (merged_file['SAS_Diff'] > 0.2), 'Remove', 'Keep')
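        # Worked example: a dataset A1 frequency of 0.50 against EUR = 0.65 gives |0.50 - 0.65| = 0.15 <= 0.2, so the
        # SNP is kept even if the other four superpopulation differences all exceed 0.2.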

        # Drop duplicate SNPs (keep=False removes every copy of a duplicated rsID, e.g. duplicates introduced by the
        # position update).
        merged_file.drop_duplicates(subset=['SNP'], keep=False, inplace=True)

        # Save the SNPs removed in this step for this chromosome; these are concatenated and written out below.
        af_diff_removed_by_chr[i] = merged_file[merged_file['AF_Decision'] == 'Remove']

        # Save the final SNPs that we are keeping for this chromosome.
        final_snps_by_chr[i] = merged_file[merged_file['AF_Decision'] == 'Keep']
        # Write the kept SNP names to a per-chromosome list, used below to filter the plink files.
        final_snps_by_chr[i]['SNP'].to_csv(final_snp_lists[i], sep='\t', header=False, index=False)

        # Make plink files for each chromosome; the bed files are needed for merging.
        subprocess.check_output([plink, '--bfile', harmonized_geno_names[i], '--extract', final_snp_lists[i],
                                 '--make-bed', '--out', af_checked_names[i]])

        # Remove extra files that we don't need anymore. These were files that were harmonized, but not checked for
        # allele frequency differences.
        if os.path.getsize(af_checked_names[i] + '.bim') > 0:
            os.remove(final_snp_lists[i])
            subprocess.call(rm + harmonized_geno_names[i] + '.*', shell=True)

        # Done with one chromosome.

        print('Finished with chr' + str(i + 1))

    # Make a big list of all SNPs removed, just for reference purposes.
    all_snps_removed = pd.concat(af_diff_removed_by_chr)
    # Write list to text file.
    all_snps_removed.to_csv('SNPs_Removed_AFCheck.txt', sep='\t', header=True, index=False)

    # Make one big list of all SNPs kept.
    all_snps_kept = pd.concat(final_snps_by_chr)
    # Write this list to a text file.
    all_snps_kept.to_csv('SNPs_Kept_AFCheck.txt', sep='\t', header=True, index=False)

    # Merge harmonized dataset genotypes
    with open("HouseMergeList.txt", "w") as f:
        wr = csv.writer(f, delimiter="\n")
        wr.writerow(af_checked_names)
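    # HouseMergeList.txt holds one plink file-set prefix per line; with a hypothetical geno_name of 'mystudy' it
    # would read:
    #   mystudy_chr1_HarmonizedTo1000G
    #   ...
    #   mystudy_chr23_HarmonizedTo1000G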
    subprocess.check_output([plink, '--merge-list', 'HouseMergeList.txt', '--geno', '0.01', '--make-bed', '--out',
                             geno_name + '_HarmonizedTo1000G'])

    if os.path.getsize(geno_name + '_HarmonizedTo1000G.bim') > 0:
        for i in range(0, len(af_checked_names)):
            subprocess.call(rm + str(af_checked_names[i]) + '.*', shell=True)

    else:
        print(Fore.RED + Style.BRIGHT)
        sys.exit("For some reason the house genotypes did not merge. You should try it manually. Then, you will need "
                 "to use the program snpflip to make sure the SNPs are on the same strand as the reference.")

    # Check to make sure the SNPs are on the same strand as the reference.
    # First, change the chromosome names to match the fasta file so they can be matched.
    # Read the merged bim file into pandas.
    bim_file = pd.read_csv(geno_name + '_HarmonizedTo1000G.bim', sep='\t', header=None,
                           dtype={0: str, 1: str, 2: int, 3: int, 4: str, 5: str})
    # Replace plink's numeric codes with the names used in the fasta file: '23' -> 'X', '24' -> 'Y', '26' -> 'MT'.
    # Assign back to the column, since replace(inplace=True) on an iloc slice does not reliably propagate.
    bim_file[0] = bim_file[0].replace({'23': 'X', '24': 'Y', '26': 'MT'})
    # Write the updated bim file.
    bim_file.to_csv(geno_name + '_HarmonizedTo1000G.bim', sep='\t', header=False, index=False, na_rep='NA')

    # Fasta file needs to be unzipped for snpflip to work.
    if os.path.exists(os.path.join(fasta_path, 'human_g1k_v37.fasta')):
        pass
    elif os.path.exists(os.path.join(fasta_path, 'human_g1k_v37.fasta.gz')):
        try:
            with gzip.open(os.path.join(fasta_path, 'human_g1k_v37.fasta.gz'), 'rb') as f_in, \
                    open(os.path.join(fasta_path, 'human_g1k_v37.fasta'), 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        except OSError:
            if system_check in ("Linux", "Darwin"):
                os.system('gunzip -c ' + os.path.join(fasta_path, 'human_g1k_v37.fasta.gz') + ' > '
                          + os.path.join(fasta_path, 'human_g1k_v37.fasta'))
            elif system_check == "Windows":
                zip_path = None
                # Look for the 7-Zip executable under Program Files.
                for r, d, f in os.walk(os.path.join('C:\\', 'Program Files')):
                    for files in f:
                        if files == "7zG.exe":
                            zip_path = os.path.join(r, files)
                subprocess.check_output([zip_path, 'e', os.path.join(fasta_path, 'human_g1k_v37.fasta.gz')])
    else:
        sys.exit("Quitting because I cannot find the fasta file. You must have this for snpflip to run.")

    # Determine if they have snpflip; if not, download it.
    if not os.path.exists(os.path.join(bindir, 'snpflip')):
        import genodownload
        genodownload.snpflip()

    snpflip_path = os.path.join(bindir, 'snpflip')
    # Perform the flip check.
    subprocess.check_output('python ' + snpflip_path + ' --fasta-genome "'
                            + os.path.join(fasta_path, 'human_g1k_v37.fasta')
                            + '" --bim-file ' + geno_name + '_HarmonizedTo1000G.bim --output-prefix ' + geno_name
                            + '_HarmonizedTo1000G', shell=True)
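    # snpflip writes <output-prefix>.reverse (SNPs called on the reverse strand) and <output-prefix>.ambiguous
    # (A/T and G/C SNPs whose strand cannot be resolved from the allele pair); only the .reverse list is used below.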

    # If SNPs exist that are on the reverse strand, flip them.
    # This currently ignores SNPs that snpflip flags as ambiguous, since those that would be hard to phase were
    # already removed. Could change this later.
    if os.path.exists(geno_name + '_HarmonizedTo1000G.reverse') \
            and os.path.getsize(geno_name + '_HarmonizedTo1000G.reverse') > 0:
        subprocess.check_output([plink, '--bfile', geno_name + '_HarmonizedTo1000G', '--flip',
                                 geno_name + '_HarmonizedTo1000G.reverse', '--make-bed', '--out',
                                 geno_name + '_HarmonizedTo1000G_StrandChecked'])
    # If .reverse doesn't exist, still make a new file to signify that you've done the strand check.
    else:
        subprocess.check_output([plink, '--bfile', geno_name + '_HarmonizedTo1000G', '--make-bed', '--out',
                                 geno_name + '_HarmonizedTo1000G_StrandChecked'])

    # Finished
    shutil.copy2(geno_name + '_HarmonizedTo1000G_StrandChecked.bed', orig_wd)
    shutil.copy2(geno_name + '_HarmonizedTo1000G_StrandChecked.bim', orig_wd)
    shutil.copy2(geno_name + '_HarmonizedTo1000G_StrandChecked.fam', orig_wd)

    print("Finished with harmonization")


def merge1000g(harmonized_name, harmonized_path):
    import os
    import sys
    import subprocess

    # Get original working directory
    orig_wd = os.getcwd()

    # Make sure user has harmonized first.
    print(Fore.BLUE + Style.BRIGHT)
    merge_proceed = input("You must harmonize your data with 1000G before this step. Have you already done this? "
                          "(y/n): ").lower()
    print(Style.RESET_ALL)

    if merge_proceed in ("y", "yes"):
        import csv
        import shutil
        import glob

        try:
            import pandas as pd
        except (ImportError, ModuleNotFoundError):
            import genodownload
            genodownload.getpandas()
            import pandas as pd

        # Create new directory for storing these files.
        if not os.path.exists('Merged_With_1000G'):
            os.makedirs('Merged_With_1000G')

        # Copy plink to new folder.
        if not glob.glob(r'plink*'):
            sys.exit("We need plink to run this part of the script. Please run step 1 if you do not have plink, or "
                     "make sure that it is in this directory.")
        else:
            for file in glob.glob(r'plink*'):
                print(file)
                shutil.copy(file, 'Merged_With_1000G')

        # Making sure they still have the 1000G vcf files. They should if they have just harmonized, but they might
        # have deleted them or something.
        print(Fore.MAGENTA + Style.BRIGHT)
        vcf_exists = input('Have you already downloaded the 1000G Phase3 VCF files? (y/n): ').lower()
        print(Style.RESET_ALL)

        if vcf_exists in ('y', 'yes'):
            # Getting user's path to VCF files
            print(Fore.GREEN)
            vcf_path = input('Please enter the pathname of where your 1000G vcf files are '
                             '(i.e. C:\\Users\\Julie White\\Box Sync\\1000GP\\VCF\\ etc.): ')
            print(Style.RESET_ALL)

        elif vcf_exists in ('n', 'no'):
            # Get module where downloading instructions are.
            import genodownload
            # From that module, call download 1000G Phase 3 VCF
            genodownload.vcf_1000g_phase3()
            # Saving VCF path
            vcf_path = os.path.join(os.getcwd(), '1000G_Phase3_VCF')
        else:
            sys.exit("Please answer yes or no. Quitting now because no VCF files.")

        # Names of per chromosome files that we're going to merge into one big file.
        ref_file_names = ['ALL.chr%d.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz' % x for x in
                          range(1, 23)]
        ref_file_names.append('ALL.chrX.phase3_shapeit2_mvncall_integrated_v1b.20130502.genotypes.vcf.gz')
        chr_1000g_phase3_names = ['chr%d_1000G_Phase3' % x for x in range(1, 24)]
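        # List index i maps to chromosome i + 1; index 22 holds the chrX files.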

        # Merge 1000G chr data into one plink formatted file, need to convert from vcf files - but only taking the snps
        # that are in the house dataset
        # Read in the SNPs_Kept_AFCheck file from the harmonization process (written at the end of the AF check above)
        if os.path.exists(os.path.join(harmonized_path, 'SNPs_Kept_AFCheck.txt')):
            house_snps_kept = pd.read_csv(os.path.join(harmonized_path, 'SNPs_Kept_AFCheck.txt'), header=0, sep='\t')
            # Keep only the 'SNP' column
            house_snps_kept = house_snps_kept.loc[:, ['SNP']]
            # Write that column to a file to be used by plink
            house_snps_kept.to_csv('Merged_With_1000G/SNPs_Kept_List.txt', sep='\t', header=False, index=False)
        else:
            sys.exit("Quitting because I cannot find a file called 'SNPs_Kept_AFCheck.txt' at "
                     + harmonized_path + ". This is a product of the harmonization process and is necessary for "
                                         "merging with 1000G.")

        # Change to directory where we're going to merge the files.
        os.chdir('Merged_With_1000G')

        # Convert vcf files to plink format.
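        # --double-id uses each VCF sample ID for both FID and IID, --biallelic-only strict skips variants with more
        # than one ALT allele listed, and --vcf-require-gt drops variants without a GT field.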
        for i in range(0, len(ref_file_names)):
            subprocess.check_output([plink, '--vcf', os.path.join(vcf_path, ref_file_names[i]), '--double-id',
                                     '--biallelic-only', 'strict', '--vcf-require-gt', '--extract',
                                     'SNPs_Kept_List.txt', '--make-bed', '--out', chr_1000g_phase3_names[i]])
        subprocess.call(rm + '*~', shell=True)

        # Create list of files to be merged into one large file.
        with open("1000GMergeList.txt", "w") as f:
            wr = csv.writer(f, delimiter="\n")
            wr.writerow(chr_1000g_phase3_names)

        # Use plink to merge those files into one large file.
        subprocess.check_output([plink, '--merge-list', '1000GMergeList.txt', '--geno', '0.01', '--make-bed', '--out',
                                 '1000G_Phase3'])

        # Read the merge log into a pandas dataframe, one tokenized line per row.
        with open('1000G_Phase3.log', 'r') as f:
            logfile = pd.DataFrame([tuple(line.strip().split(" ")) for line in f])

        # If the logfile contains warnings, write a text file '1000G_MergeWarnings.txt' with the SNPs that threw
        # warnings.
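        # plink 1.9 merge warnings of the form
        #   Warning: Multiple positions seen for variant 'rs123'.
        # carry the quoted rsID as token 6, which the split on "'" below extracts; warning lines shaped differently
        # yield NaN in column 1 and are dropped.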
        rsid_warnings = []
        if logfile[0].str.contains('Warning:').any():
            rsid_warnings = logfile.loc[logfile[0] == 'Warning:', 6].str.split("'", expand=True)
            rsid_warnings[1].dropna(how='any').to_csv('1000G_MergeWarnings.txt', sep='\t', header=False, index=False)

        # If the text file 1000G_MergeWarnings exists...
        if os.path.exists('1000G_MergeWarnings.txt'):
            # If merge warnings and missnps exist, exclude both from 1000G completely (there are plenty of other snps)
            if os.path.exists('1000G_Phase3-merge.missnp'):
                # Read in missnp file
                missnp = pd.read_csv('1000G_Phase3-merge.missnp', sep='\t', header=None)
                # Merge the warning snps with the missnps
                warnings_missnp = pd.concat([rsid_warnings, missnp], axis=0)
                # Drop duplicates and write to a file to be used in plink.
                warnings_missnp[1].dropna(how='any').to_csv('1000G_warnings_missnp.txt', sep='\t',
                                                            header=False, index=False)
                # Remove these snps from plink files and create new plink files.
                for i in range(0, len(chr_1000g_phase3_names)):
                    subprocess.check_output([plink, '--bfile', chr_1000g_phase3_names[i], '--exclude',
                                             '1000G_warnings_missnp.txt', '--geno', '0.01', '--make-bed', '--out',
                                             chr_1000g_phase3_names[i]])
                # Remove old plink files.
                subprocess.call(rm + '*~', shell=True)
                # Retry the merge
                subprocess.check_output([plink, '--merge-list', '1000GMergeList.txt', '--geno', '0.01', '--make-bed',
                                         '--out', '1000G_Phase3'])
                # The merge should be successful this time, but the user should double check.
                print("Successfully merged 1000G, though you should double-check the log file to be sure.")

            else:  # If only merge warnings exist, exclude from 1000G completely
                # Use plink to exclude the merge warning snps.
                for i in range(0, len(chr_1000g_phase3_names)):
                    subprocess.check_output([plink, '--bfile', chr_1000g_phase3_names[i], '--exclude',
                                             '1000G_MergeWarnings.txt', '--geno', '0.01', '--make-bed', '--out',
                                             chr_1000g_phase3_names[i]])
                # Remove old plink files.
                subprocess.call(rm + '*~', shell=True)
                # Try merge again.
                subprocess.check_output([plink, '--merge-list', '1000GMergeList.txt', '--geno', '0.01', '--make-bed',
                                         '--out', '1000G_Phase3'])
                # Merge should be successful this time, but the user should double check.
                print("Successfully merged 1000G, though you should double-check the log file to be sure.")

        # If only the missnps exist (this branch only runs when no merge-warnings file was written), remove them
        # in 1000G.
        elif os.path.exists('1000G_Phase3-merge.missnp'):
            # Use plink to remove the missnps
            for i in range(0, len(chr_1000g_phase3_names)):
                subprocess.check_output([plink, '--bfile', chr_1000g_phase3_names[i], '--exclude',
                                         '1000G_Phase3-merge.missnp', '--geno', '0.01', '--make-bed', '--out',
                                         chr_1000g_phase3_names[i]])
            # Remove old plink files
            subprocess.call(rm + '*~', shell=True)
            # Retry the merge
            subprocess.check_output([plink, '--merge-list', '1000GMergeList.txt', '--geno', '0.01', '--make-bed',
                                     '--out', '1000G_Phase3'])
            # Merge should be successful this time, but the user should double check.
            print("Successfully merged 1000G, though you should double check the log file to be sure.")

        elif os.path.exists('1000G_Phase3.bim'):
            # Merge was successful.
            print("Successfully merged 1000G.")
            # Remove the per chromosome 1000G files, since they're just taking up space now.
            for i in range(0, len(chr_1000g_phase3_names)):
                subprocess.call(rm + chr_1000g_phase3_names[i] + '.*', shell=True)

        else:  # If merge didn't work for reasons other than merge warnings and missnps.
            print(Fore.RED + Style.BRIGHT)
            sys.exit("Unable to merge 1000G chromosome files. You should try to merge them on your own.")

        # Merge of house data and 1000G #
        # Copying harmonized to 1000G files to this folder.
        shutil.copy2(os.path.join(orig_wd, harmonized_name + '.bed'), os.getcwd())
        shutil.copy2(os.path.join(orig_wd, harmonized_name + '.bim'), os.getcwd())
        shutil.copy2(os.path.join(orig_wd, harmonized_name + '.fam'), os.getcwd())

        # Perform initial merge
        subprocess.check_output([plink, '--bfile', harmonized_name, '--bmerge', '1000G_Phase3', '--geno', '0.01',
                                 '--make-bed', '--out', harmonized_name + '_1000G'])

        # Read the merge log into a pandas dataframe to see if anything went wrong.
        with open(harmonized_name + '_1000G.log', 'r') as f:
            logfile = pd.DataFrame([tuple(line.strip().split(" ")) for line in f])

        # Check to see if the logfile has warnings
        if logfile[0].str.contains('Warning:').any():
            # Identify the snps that made the warning.
            rsid_warnings = logfile.loc[logfile[0] == 'Warning:', 6].str.split("'", expand=True)
            # Put those SNPs in a file so we can remove them.
            rsid_warnings[1].dropna(how='any').to_csv(harmonized_name + '_1000G_MergeWarnings.txt', sep='\t',
                                                      header=False, index=False)
        # If there are merge warnings
        if os.path.exists(harmonized_name + '_1000G_MergeWarnings.txt'):
            # and if there are triallelic snps that need to be flipped (missnp)
            if os.path.exists(harmonized_name + '_1000G-merge.missnp'):
                # If merge warnings and missnps exist, exclude warning snps from both 1000G and house dataset
                subprocess.check_output([plink, '--bfile', '1000G_Phase3', '--exclude',
                                         harmonized_name + '_1000G_MergeWarnings.txt', '--geno', '0.01', '--make-bed',
                                         '--out', '1000G_Phase3'])
                # Flip missnps in house dataset.
                subprocess.check_output([plink, '--bfile', harmonized_name, '--exclude',
                                         harmonized_name + '_1000G_MergeWarnings.txt', '--flip',
                                         harmonized_name + '_1000G-merge.missnp', '--geno', '0.01', '--make-bed',
                                         '--out', harmonized_name])
                # Remove old files.
                subprocess.call(rm + '*~', shell=True)
                # Retry merge.
                subprocess.check_output([plink, '--bfile', harmonized_name, '--bmerge', '1000G_Phase3', '--geno',
                                         '0.01', '--make-bed', '--out', harmonized_name + '_1000G_merge2'])

            else:  # If only mergewarnings exists, exclude warning snps from both 1000G and house dataset.
                subprocess.check_output([plink, '--bfile', '1000G_Phase3', '--exclude',
                                         harmonized_name + '_1000G_MergeWarnings.txt', '--geno', '0.01', '--make-bed',
                                         '--out', '1000G_Phase3'])
                # Exclude warning snps from house dataset.
                subprocess.check_output([plink, '--bfile', harmonized_name, '--exclude',
                                         harmonized_name + '_1000G_MergeWarnings.txt', '--geno', '0.01', '--make-bed',
                                         '--out', harmonized_name])
                # Remove old plink files.
                subprocess.call(rm + '*~', shell=True)
                # Retry merge
                subprocess.check_output([plink, '--bfile', harmonized_name, '--bmerge', '1000G_Phase3', '--geno',
                                         '0.01', '--make-bed', '--out', harmonized_name + '_1000G_merge2'])
        # If only the missnps exist, flip them in the house dataset.
        elif os.path.exists(harmonized_name + '_1000G-merge.missnp') \
                and not os.path.exists(harmonized_name + '_1000G_MergeWarnings.txt'):
            # Flip in house dataset.
            subprocess.check_output([plink, '--bfile', harmonized_name, '--flip',
                                     harmonized_name + '_1000G-merge.missnp', '--geno', '0.01', '--make-bed', '--out',
                                     harmonized_name])
            # Remove old plink files.
            subprocess.call(rm + '*~', shell=True)
            # Retry the merge.
            subprocess.check_output([plink, '--bfile', harmonized_name, '--bmerge', '1000G_Phase3', '--geno', '0.01',
                                     '--make-bed', '--out', harmonized_name + '_1000G_merge2'])

        elif os.path.exists(harmonized_name + '_1000G.bim'):
            # If mergewarnings and missnips don't exist, then hopefully the merge happened successfully on the first
            # try.
            print("Successfully merged house dataset with 1000G on the first try, though you should double check. I "
                  "can't predict every error.")
            # Copy successfully merged files to original working directory.
            shutil.copy2(harmonized_name + '_1000G.bed', orig_wd)
            shutil.copy2(harmonized_name + '_1000G.bim', orig_wd)
            shutil.copy2(harmonized_name + '_1000G.fam', orig_wd)
            shutil.copy2(harmonized_name + '_1000G.log', orig_wd)

            # Change back to original working directory.
            os.chdir(orig_wd)

        else:
            print(Fore.RED + Style.BRIGHT)
            sys.exit("The house dataset did not merge properly with 1000G, but not because of SNP merge "
                     "warnings or SNPs that needed to be flipped. I'm sorry, you'll have to perform the merge on your "
                     "own.")

        # If we had to perform a second merge because of one of the reasons above.
        if os.path.exists(harmonized_name + '_1000G_merge2.log'):
            # Read the second merge log into a pandas dataframe.
            with open(harmonized_name + '_1000G_merge2.log', 'r') as f:
                logfile = pd.DataFrame([tuple(line.strip().split(" ")) for line in f])

            # Check to see if the logfile has warnings
            if logfile[0].str.contains('Warning:').any():
                # Figure out what snps caused the warnings.
                rsid_warnings = logfile.loc[logfile[0] == 'Warning:', 6].str.split("'", expand=True)
                # Write these warnings to a text file for plink to use.
                rsid_warnings[1].dropna(how='any').to_csv(harmonized_name + '_1000G_merge2_warnings.txt', sep='\t',
                                                          header=False,
                                                          index=False)
            # If warnings exist
            if os.path.exists(harmonized_name + '_1000G_merge2_warnings.txt'):
                # And if there are still triallelic snps
                if os.path.exists(harmonized_name + '_1000G_merge2-merge.missnp'):
                    # Import missnp file to pandas
                    missnp = pd.read_csv(harmonized_name + '_1000G_merge2-merge.missnp', sep='\t', header=None)
                    # Merge this with merge warnings snps
                    warnings_missnp = pd.concat([rsid_warnings, missnp], axis=0)
                    # Write to text file for plink to use.
                    warnings_missnp[1].dropna(how='any').to_csv(harmonized_name + '_1000G_merge2_warnings_missnp.txt',
                                                                sep='\t', header=False, index=False)
                    # Remove all of these snps from 1000G dataset.
                    subprocess.check_output([plink, '--bfile', '1000G_Phase3', '--exclude',
                                             harmonized_name + '_1000G_merge2_warnings_missnp.txt', '--geno', '0.01',
                                             '--make-bed', '--out', '1000G_Phase3'])
                    # Remove all of these snps from house dataset.
                    subprocess.check_output([plink, '--bfile', harmonized_name, '--exclude',
                                             harmonized_name + '_1000G_merge2_warnings_missnp.txt', '--geno', '0.01',
                                             '--make-bed', '--out', harmonized_name])
                    # Remove old plink files.
                    subprocess.call(rm + '*~', shell=True)
                    # Try merge a third time.
                    subprocess.check_output([plink, '--bfile', harmonized_name, '--bmerge', '1000G_Phase3', '--geno',
                                             '0.01', '--make-bed', '--out', harmonized_name + '_1000G_merge3'])

                else:  # If only merge warnings still exist, exclude from both 1000G and house dataset.
                    # Exclude from 1000G dataset.
                    subprocess.check_output([plink, '--bfile', '1000G_Phase3', '--exclude',
                                             harmonized_name + '_1000G_merge2_warnings.txt', '--geno', '0.01',
                                             '--make-bed', '--out', '1000G_Phase3'])
                    # Exclude from house dataset
                    subprocess.check_output([plink, '--bfile', harmonized_name, '--exclude',
                                             harmonized_name + '_1000G_merge2_warnings.txt', '--geno', '0.01',
                                             '--make-bed', '--out', harmonized_name])
                    # Remove old plink files.
                    subprocess.call(rm + '*~', shell=True)
                    # Retry merge a third time.
                    subprocess.check_output([plink, '--bfile', harmonized_name, '--bmerge', '1000G_Phase3', '--geno',
                                             '0.01', '--make-bed', '--out', harmonized_name + '_1000G_merge3'])
            # If only the missnps still exist, remove them in both datasets.
            elif os.path.exists(harmonized_name + '_1000G_merge2-merge.missnp') \
                    and not os.path.exists(harmonized_name + '_1000G_merge2_warnings.txt'):
                # Exclude from house dataset.
                subprocess.check_output([plink, '--bfile', harmonized_name, '--exclude',
                                         harmonized_name + '_1000G_merge2-merge.missnp', '--geno', '0.01', '--make-bed',
                                         '--out', harmonized_name])
                # Exclude from 1000G dataset.
                subprocess.check_output([plink, '--bfile', '1000G_Phase3', '--exclude',
                                         harmonized_name + '_1000G_merge2-merge.missnp', '--geno', '0.01', '--make-bed',
                                         '--out', '1000G_Phase3'])
                # Remove old files.
                subprocess.call(rm + '*~', shell=True)
                # Retry merge a third time.
                subprocess.check_output([plink, '--bfile', harmonized_name, '--bmerge', '1000G_Phase3', '--geno',
                                         '0.01', '--make-bed', '--out', harmonized_name + '_1000G_merge3'])
            # If we don't find warnings or missnps
            elif os.path.exists(harmonized_name + '_1000G_merge2.bim'):
                # Merge should have happened, but user should check.
                print("Successfully merged house dataset with 1000G on 2nd try, though you should double check. I "
                      "can't predict every error.")
                # Copy merged files to original working directory
                shutil.copy2(harmonized_name + '_1000G_merge2.bed', orig_wd)
                shutil.copy2(harmonized_name + '_1000G_merge2.bim', orig_wd)
                shutil.copy2(harmonized_name + '_1000G_merge2.fam', orig_wd)
                shutil.copy2(harmonized_name + '_1000G_merge2.log', orig_wd)

                # Change back to original working directory.
                os.chdir(orig_wd)
            # If the merge didn't happen, but not because of merge warnings or missnps.
            else:
                print(Fore.RED + Style.BRIGHT)
                sys.exit("House dataset and 1000G did not successfully merge 2nd time, though not because of merge "
                         "warnings or SNPs that needed to be flipped. You'll have to perform the merge on your own. "
                         "I'm sorry!")

        if os.path.exists(harmonized_name + '_1000G_merge3.log'):
            # If the merge3 log exists, read it into a pandas dataframe.
            with open(harmonized_name + '_1000G_merge3.log', 'r') as f:
                logfile = pd.DataFrame([tuple(line.strip().split(" ")) for line in f])

            # If the bim file doesn't exist, try to figure out why.
            if not os.path.exists(harmonized_name + '_1000G_merge3.bim'):
                # If logfile still has warnings, or there are still missnps, the user will need to identify them and
                # take care of them manually.
                if logfile[0].str.contains('Warning:').any() \
                        and os.path.exists(harmonized_name + '_1000G_merge3-merge.missnp'):
                    print(Fore.RED + Style.BRIGHT)
                    sys.exit("I'm sorry, the logfile still has warnings, even after removing snps that "
                             "threw errors in the first two tries. There are also still snps with 3+ variants present, "
                             "even after flipping some and removing the ones that the flip didn't solve. You'll have "
                             "to manually deal with these using the merge3 log and the merge3-merge.missnp file.")
                # Check to see if the logfile still has warnings
                elif logfile[0].str.contains('Warning:').any():
                    print(Fore.RED + Style.BRIGHT)
                    sys.exit("I'm sorry, the logfile still has warnings, even after removing snps that threw errors in "
                             "the first two tries. You'll have to manually deal with these using the merge3 log file.")
                # If SNPs with 3+ alleles still exist
                elif os.path.exists(harmonized_name + '_1000G_merge3-merge.missnp'):
                    print(Fore.RED + Style.BRIGHT)
                    sys.exit("I'm sorry, there are still SNPs with 3+ alleles present, even after flipping some and "
                             "removing the ones that the flip didn't solve. You'll have to deal with these manually "
                             "using the merge3-merge.missnp file.")

            # If the bim file exists, then the merge should have happened correctly.
            if os.path.exists(harmonized_name + '_1000G_merge3.bim'):
                print("House dataset and 1000G merged correctly on 3rd try. Though you should double-check the log "
                      "file, I can't predict every error.")
                # Copy files to original working directory.
                shutil.copy2(harmonized_name + '_1000G_merge3.bed', orig_wd)
                shutil.copy2(harmonized_name + '_1000G_merge3.bim', orig_wd)
                shutil.copy2(harmonized_name + '_1000G_merge3.fam', orig_wd)
                shutil.copy2(harmonized_name + '_1000G_merge3.log', orig_wd)

                # Change back to original working directory.
                os.chdir(orig_wd)

    # End the program if the user did not harmonize first.
    elif merge_proceed in ('n', 'no'):
        sys.exit("Please harmonize your genotypes with 1000G first.")

    # End the program if the user did not give a correctly formatted answer.
    else:
        sys.exit('Please answer yes or no.')


def het(geno_name):
    # Identifies individuals with extreme heterozygosity values (more than ±3 SD from the mean)
    # Getting extra required modules
    try:
        import pandas as pd
    except (ImportError, ModuleNotFoundError):
        import genodownload
        genodownload.getpandas()
        import pandas as pd

    try:
        import numpy as np
    except (ImportError, ModuleNotFoundError):
        import genodownload
        genodownload.getnumpy()
        import numpy as np

    import subprocess

    # Use plink to calculate the heterozygosity, paying attention to geno and mind.
    subprocess.check_output([
        plink, '--bfile', geno_name, '--geno', '0.1', '--mind', '0.1', '--het',
        '--out', geno_name
    ])

    # Read het file into pandas
    het_file = pd.read_csv(geno_name + '.het', sep=r'\s+', header=0)

    # Create new column with formula: (N(NM)-O(HOM))/N(NM)
    het_file['HET'] = (het_file['N(NM)'] -
                       het_file['O(HOM)']) / het_file['N(NM)']
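    # e.g. N(NM) = 1000 non-missing genotypes with O(HOM) = 650 observed homozygous genotypes gives
    # HET = (1000 - 650) / 1000 = 0.35.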

    # Getting standard deviation and average of HET column
    het_sd = np.std(het_file['HET'])
    het_avg = np.mean(het_file['HET'])

    # Add label 'keep' to people within 3*SD of the average het value, give 'remove' to everyone else.
    het_file['HET_Filter'] = np.where(
        (het_file['HET'] > het_avg - 3 * het_sd) &
        (het_file['HET'] < het_avg + 3 * het_sd), 'Keep', 'Remove')
    # Write this file so the user has it.
    het_file.to_csv(geno_name + '.het', sep='\t', header=True, index=False)
    # Make a list of the people who pass the filter.
    het_keep = het_file[het_file['HET_Filter'] == 'Keep']
    # Write this file so that we can use it later to filter people out.
    het_keep[['FID', 'IID']].to_csv(geno_name + '_KeptAfterHetCheck.txt',
                                    sep='\t',
                                    header=False,
                                    index=False)
    # Make a list of the people who fail the filter.
    het_rem = het_file[het_file['HET_Filter'] == 'Remove']
    # Write this to file so we have the record.
    het_rem.to_csv(geno_name + '_RemAfterHetCheck.txt',
                   sep='\t',
                   header=True,
                   index=False)

    # Make new plink file with people passing het check.
    subprocess.check_output([
        plink, '--bfile', geno_name, '--keep',
        geno_name + '_KeptAfterHetCheck.txt', '--geno', '0.1', '--make-bed',
        '--out', geno_name + '_HetChecked'
    ])

    print(
        "Done. Your new file of people with non-extreme heterozygosity values will be called "
        + geno_name + "_HetChecked")
import gzip
import shutil
from os.path import expanduser

try:
    import argparse
except (ImportError, ModuleNotFoundError):
    import genodownload
    genodownload.getargparse()
    import argparse

try:
    import pandas as pd
except (ImportError, ModuleNotFoundError):
    import genodownload
    genodownload.getpandas()
    import pandas as pd

try:
    import numpy as np
except (ImportError, ModuleNotFoundError):
    import genodownload
    genodownload.getnumpy()
    import numpy as np

try:
    import colorama
    from colorama import init, Fore, Style
    init()
except ImportError:
    import genodownload