Ejemplo n.º 1
0
def annotateVCFs(vcflistpath=bsmutils.get_bsmdir() + '/results/calls/filtered-vcfs.tsv',
        vcfdir=bsmutils.get_bsmdir() + '/results/calls/'):
    '''
    Run annotateVCF on every VCF named in a sample list

    Arguments
    vcflistpath: path to a header-less, tab-separated file with two columns
        (sample, file)
    vcfdir: directory holding the 'filtered' (input) and 'annotated'
        (output) subdirectories

    Value:
    a list with the return value of annotateVCF for each sample, in the
    order of the list file
    '''
    vcflist = pd.read_csv(vcflistpath, sep='\t', names=['sample', 'file'], index_col='sample')
    # both prefixes are the same for every sample
    filtered_prefix = vcfdir + os.path.sep + 'filtered' + os.path.sep
    annotated_dir = vcfdir + os.path.sep + 'annotated' + os.path.sep
    results = []
    for sample in vcflist.index:
        source_vcf = filtered_prefix + vcflist.loc[sample, 'file']
        results.append(
            annotateVCF(invcf=source_vcf, sample=sample, targetdir=annotated_dir))
    return results
Ejemplo n.º 2
0
def annotateVCF(invcf=bsmutils.get_bsmdir() + '/results/calls/filtered/MSSM_106_brain.ploidy_50.filtered.vcf',
        sample='MSSM_106_NeuN_pl',
        targetdir=bsmutils.get_bsmdir() + '/results/calls/annotated/'):
    '''
    Run the external annotate-vcf-bsm script on a single VCF

    Arguments
    invcf: path to the input VCF
    sample: sample name passed to the script
    targetdir: directory passed to the script with its -t option

    Value:
    the subprocess.CompletedProcess of the script run, with stdout and stderr
    captured; a non-zero exit status does not raise here, so callers should
    inspect returncode themselves
    '''
    annotator = bsmutils.get_bsmdir() + '/src/annotate-vcf-bsm'
    return subprocess.run([annotator, '-t', targetdir, invcf, sample],
                          capture_output=True)
Ejemplo n.º 3
0
def get_multi_annotations(annotlist,
                          vcflistpath=bsmutils.get_bsmdir() +
                          '/results/calls/filtered-vcfs-Chess-Walsh.tsv',
                          annotdirpath=bsmutils.get_bsmdir() +
                          '/results/2020-09-07-annotations',
                          na_values={},
                          simplecolumns=True):
    '''
    Read several annotation types for all listed samples into one DataFrame

    For each annotation type in annotlist the file
    <annotdirpath>/<sample>/<annotation type>.txt is read for every sample in
    the VCF list.  Per-sample tables are stacked row-wise (axis=0) and the
    per-type results are then joined column-wise (axis=1).

    Arguments
    annotlist: iterable of annotation type names
    vcflistpath: header-less, tab-separated file with columns (sample, file)
    annotdirpath: directory containing one subdirectory per sample
    na_values: dict mapping an annotation type to the na_values handed to
        read_TXT_per_annotation; types not in the dict get []
    simplecolumns: forwarded to read_TXT_per_annotation

    Value:
    the combined annotations.  A sample/type pair whose reading raises
    ValueError is skipped (treated as None); pd.concat itself raises
    ValueError when every annotation type ended up as None.
    '''
    vcflist = pd.read_csv(vcflistpath,
                          sep='\t',
                          names=['sample', 'file'],
                          index_col='sample')
    samplestr = '((MSSM|PITT)_[0-9]+)_(NeuN_pl|NeuN_mn|muscle)'

    def individual_of(sample):
        # group 1 of samplestr is the <institution>_<number> part
        return re.sub(samplestr, 'CMC_\\1', sample)

    def tissue_of(sample):
        if re.match(samplestr, sample):
            return re.sub(samplestr, '\\3', sample)  # Chess data
        return 'frontal cortex'  # Walsh data

    def read_one(sample, annotyp):
        tsvpath = os.path.sep.join([annotdirpath, sample, annotyp + '.txt'])
        indivID = individual_of(sample)
        tissue = tissue_of(sample)
        na_val = na_values.get(annotyp, [])
        try:
            table = read_TXT_per_annotation(tsvpath,
                                            indivID,
                                            tissue,
                                            simplecolumns=simplecolumns,
                                            na_values=na_val)
            return annotation_duplicates(table, sep=':')
        except ValueError:
            return None

    def read_all_samples(annotyp):
        try:
            return pd.concat(
                [read_one(sample, annotyp) for sample in vcflist.index],
                axis=0)
        except ValueError:
            # e.g. no sample yielded a table for this annotation type
            return None

    per_type = [read_all_samples(annotyp) for annotyp in annotlist]
    return pd.concat(per_type, axis=1)
Ejemplo n.º 4
0
def create_colsdict():
    '''
    Build the input dictionary for regularize_categ_cols

    Value:
    a dict mapping a column name to the ordered list of its categories.  The
    categories of 'regbuild_Epigenome' are read, one per line, from the
    regbuild-epigenomes file under the annotations results directory.
    '''

    def read_categories(fpath):
        # one category per line, surrounding whitespace removed
        with open(fpath) as handle:
            return [line.strip() for line in handle]

    epigenomes_path = (bsmutils.get_bsmdir() +
                       '/results/2020-09-07-annotations/regbuild-epigenomes')
    return {
        # order reflecting severity of effect
        'sift_Prediction': [
            'Deleterious', 'Deleterious - Low Confidence', 'Tolerated',
            'Tolerated - Low Confidence'
        ],
        # order reflecting increasing frequency of categories in the data set
        'encode_Feature Type Class':
        ['Polymerase', 'Open Chromatin', 'Transcription Factor', 'Histone'],
        'ensembl_Predicted Function': [
            'intronic (splice_site)', 'coding', 'intronic', '5utr', '3utr',
            '5upstream', '3downstream', 'non-coding intronic', 'non-coding'
        ],
        'regbuild_Epigenome': read_categories(epigenomes_path),
        'structvar_Type': ['complex', 'loss', 'gain'],
    }
Ejemplo n.º 5
0
def load_data(picklepath=bsmutils.get_bsmdir() +
              '/results/2020-09-07-annotations/annotated-calls.p'):
    '''
    Unpickle and return the annotated calls stored at picklepath

    Arguments
    picklepath: path to the pickle file

    Value:
    whatever object the pickle file contains
    '''
    with open(picklepath, 'rb') as handle:
        return pickle.load(handle)
Ejemplo n.º 6
0
def readVCFs(vcflistpath=bsmutils.get_bsmdir() + '/results/calls/filtered-vcfs.tsv',
        vcfdir=bsmutils.get_bsmdir() + '/results/calls/', clean=True):
    '''
    Reads the calls/records of several VCFs into rows of a single DataFrame

    The VCFs are looked up in the 'annotated' subdirectory of vcfdir.

    Arguments
    vcflistpath: path to file listing all VCFs (tab-separated: sample, file)
    vcfdir: the directory of the VCFs
    clean: whether to remove redundant & degenerate columns

    Value:
    calls: a pandas DataFrame
    '''
    vcflist = pd.read_csv(vcflistpath, sep='\t', names=['sample', 'file'], index_col='sample')
    annotated_prefix = vcfdir + os.sep + 'annotated' + os.sep
    # build every path first, then read the files in list order
    filepaths = [annotated_prefix + fname for fname in vcflist['file']]
    frames = list(map(readVCF, filepaths))
    calls = pd.concat(frames, axis=0)
    if not clean:
        return calls
    return clean_calls(calls, dropna=True, dropdegenerate=True, dropredundant=True)
Ejemplo n.º 7
0
def do_annot(annotlist=annotlist,
             na_values=na_values,
             colsdict=create_colsdict(),
             fpath=bsmutils.get_bsmdir() +
             '/results/2020-09-07-annotations/annot.p',
             calls=individuals.get_datasets()):
    '''
    Main function: read SNPnexus annotations for the full Chess and Walsh datasets

    The result is cached as a pickle: when fpath exists it is loaded and
    returned as is, otherwise the annotations are read with
    get_multi_annotations and then written to fpath.

    Arguments
    annotlist: annotation types, passed to get_multi_annotations
    na_values: per-annotation NA values, passed to get_multi_annotations
    colsdict: accepted for interface compatibility; not used in this function
    fpath: path of the pickle cache
    calls: accepted for interface compatibility; not used in this function

    Value:
    annot: the annotations, as returned by get_multi_annotations or as
    stored in the cache
    '''
    if os.path.exists(fpath):
        print('loading annot DataFrame from', fpath)
        # NOTE: unpickling executes arbitrary code; fpath must be a trusted,
        # locally produced cache file
        with open(fpath, 'rb') as f:
            annot = pickle.load(f)
    else:
        vcflistpath = bsmutils.get_bsmdir(
        ) + '/results/calls/filtered-vcfs-Chess-Walsh.tsv'
        annotdirpath = bsmutils.get_bsmdir(
        ) + '/results/2020-09-07-annotations'
        annot = get_multi_annotations(annotlist, vcflistpath, annotdirpath,
                                      na_values)
        # the context manager closes (and thereby flushes) the cache file
        # even when pickling fails
        with open(fpath, 'wb') as f:
            pickle.dump(annot, f)
    return (annot)
Ejemplo n.º 8
0
def read_annotlist(annotpath=bsmutils.get_bsmdir() + '/tables/VCF-HC.annotations', withFORMAT=False):
    '''
    Reads a file listing VCF annotations, one per line, into a list.

    Parameters
    annotpath: the path to the annotation list file
    withFORMAT: if False (default) entries starting with 'FORMAT' are omitted

    Value: the list of annotations, without newline characters
    '''
    with open(annotpath) as handle:
        annotations = [line.rstrip('\n') for line in handle]
    if withFORMAT:
        return annotations
    return [name for name in annotations if not name.startswith('FORMAT')]
Ejemplo n.º 9
0
def read_clinical(ancestry=True):
    '''
    Read the CMC clinical metadata (CMC_Human_clinical_metadata.csv)

    When the file at cmc_clinical_path is missing, it is fetched from Synapse
    (entity syn2279441) into the resources directory first.

    Arguments
    ancestry: if True, the ancestry table at cmc_ancestry_path, minus its
        'Genotyping_Sample_ID' and 'Cluster' columns, is appended column-wise

    Value:
    clinical: a pandas DataFrame indexed by 'Individual ID'
    '''
    if os.path.exists(cmc_clinical_path):
        fpath = cmc_clinical_path
    else:
        # imported lazily: only needed when the file must be downloaded
        import synapseclient
        syn = synapseclient.login()
        entity = syn.get('syn2279441',
                         downloadLocation=bsmutils.get_bsmdir() + '/resources/',
                         ifcollision='overwrite.local')
        fpath = entity.path
    clinical = pd.read_csv(fpath, index_col='Individual ID')
    if not ancestry:
        return clinical
    ancestry_table = pd.read_csv(cmc_ancestry_path,
                                 sep='\t',
                                 index_col='Individual_ID')
    ancestry_table = ancestry_table.drop(
        columns=['Genotyping_Sample_ID', 'Cluster'])
    return pd.concat([clinical, ancestry_table], axis=1)
Ejemplo n.º 10
0
        targetdir=bsmutils.get_bsmdir() + '/results/calls/annotated/'):
    '''
    Some help would be nice
    '''
    script = bsmutils.get_bsmdir() + '/src/annotate-vcf-bsm'
    cmd = [script, '-t', targetdir, invcf, sample]
    p =  subprocess.run(cmd, capture_output=True)
    return(p)

def annotateVCFs(vcflistpath=bsmutils.get_bsmdir() + '/results/calls/filtered-vcfs.tsv',
        vcfdir=bsmutils.get_bsmdir() + '/results/calls/'):
    '''
    Run annotateVCF on every VCF named in a sample list

    Arguments
    vcflistpath: path to a header-less, tab-separated file with two columns
        (sample, file)
    vcfdir: directory holding the 'filtered' (input) and 'annotated'
        (output) subdirectories

    Value:
    a list with the return value of annotateVCF for each sample, in the
    order of the list file
    '''
    vcflist = pd.read_csv(vcflistpath, sep='\t', names=['sample', 'file'], index_col='sample')
    # both prefixes are the same for every sample
    filtered_prefix = vcfdir + os.path.sep + 'filtered' + os.path.sep
    annotated_dir = vcfdir + os.path.sep + 'annotated' + os.path.sep
    results = []
    for sample in vcflist.index:
        source_vcf = filtered_prefix + vcflist.loc[sample, 'file']
        results.append(
            annotateVCF(invcf=source_vcf, sample=sample, targetdir=annotated_dir))
    return results

if __name__ == '__main__':
    # Script entry point: annotate every VCF of the list, then read the
    # annotated VCFs back (the resulting DataFrame is not kept).
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-d',
        '--dir',
        default=bsmutils.get_bsmdir() + '/results/calls/',
        help='main VCF directory (bsm/results/calls/)')
    parser.add_argument(
        '-l',
        '--vcflist',
        default=bsmutils.get_bsmdir() + '/results/calls/filtered-vcfs.tsv',
        help='list of samples and VCF files (bsm/results/calls/filtered-vcfs.tsv)')
    args = parser.parse_args()

    annotateVCFs(vcflistpath=args.vcflist, vcfdir=args.dir)
    readVCFs(vcflistpath=args.vcflist, vcfdir=args.dir)
Ejemplo n.º 11
0
import scipy.stats
import numpy as np
import pandas as pd
import os.path
from bsmcalls import readVCF
from bsmcalls import preprocessing
import bsmutils

# Synapse entity ID of the CMC clinical metadata
cmc_clinical_synid = 'syn2279441'

# Input files, all located below the bsm project directory
cmc_clinical_path = (bsmutils.get_bsmdir() +
                     '/resources/CMC_Human_clinical_metadata.csv')
cmc_ancestry_path = (
    bsmutils.get_bsmdir() +
    '/resources/cmc-ancestry/CMC_MSSM-Penn-Pitt_DNA_GENOTYPE_ANCESTRY_GemTools.tsv')
walsh_gsub_path = (
    bsmutils.get_bsmdir() +
    '/resources/walsh-manifests/genomics_subject02_template_WalshParkASD-corr.csv')
walsh_vcfs_path = (bsmutils.get_bsmdir() +
                   '/results/calls/filtered-vcfs-Walsh.tsv')
chess_vcfs_path = (bsmutils.get_bsmdir() +
                   '/results/calls/filtered-vcfs.tsv')

# NOTE(review): v1 and v2 look like column-name selections (call annotations
# and clinical variables, respectively) -- confirm against their users
v1 = [
    'AF',
    'ALT',
    'BaseQRankSum',
    'DP',
    'FILTER/PASS',
    'FS',
    'GWASpval',
    'REF',
    'ReadPosRankSum',
    'SOR',
    'VQSLOD',
    'chromatinState_DLPFC',
    'culprit',
    'szdbCNVcount',
]
v2 = [
    'Dx',
    'AntipsychAtyp',
    'AntipsychTyp',
    'Institution',
    'EV.3',
]


def read_clinical(ancestry=True):
    # CMC_Human_clinical_metadata.csv
    if not os.path.exists(cmc_clinical_path):
        import synapseclient
Ejemplo n.º 12
0
def get_geneset(df=None, col='Gene(s) tagged'):
    '''
    Collect the set of gene names listed in one column of a table

    Each cell of the column holds zero or more gene names separated by ', ';
    missing cells are ignored.

    Arguments
    df: the table; if None (default) the CLOZUK supplementary table 4 is read
        from the bsm resources directory, skipping its first 7 rows
    col: name of the column holding the gene lists

    Value:
    geneset: the set of distinct gene names (empty when the column has no
    non-missing cells)
    '''
    if df is None:
        # read on demand rather than in the signature, so that merely
        # importing the module neither needs nor reads the file
        df = pd.read_csv(
            bsmutils.get_bsmdir() + '/resources/CLOZUK/supp-table-4.csv',
            skiprows=7)
    # honour `col` (the column name used to be hard-coded here)
    gene_lists = df[col].str.split(', ').dropna()
    # flatten without summing lists: linear time, and fine for empty input
    geneset = {gene for genes in gene_lists for gene in genes}
    return (geneset)
Ejemplo n.º 13
0
import pandas as pd
import numpy as np
import bsmutils

# Common path prefixes of the expression tables; the functions reading them
# append the remainder of the file name
roadmap_rna_bname = (
    bsmutils.get_bsmdir() +
    '/resources/roadmap-epigenomics/rna/expression/57epigenomes.')
proteinatlas_rna_bname = (
    bsmutils.get_bsmdir() +
    '/resources/proteinatlas/expression/tissue_category_rna_brain_')


def read_roadmap_rna(kind='RPKM',
                     sampledict={
                         'E071': 'BRN.HIPP.MID',
                         'E082': 'BRN.FET.F'
                     },
                     suffix=False):
    '''
    Read selected sample columns of a Roadmap RNA expression table

    The tab-separated file <roadmap_rna_bname><kind>.pc is read, restricted
    to the columns named by the keys of sampledict.

    Arguments
    kind: the part of the file name between the base name and '.pc'
    sampledict: maps column names in the file to the names used in the result
    suffix: if True, '_' + kind is appended to every new column name

    Value:
    a pandas DataFrame whose columns are renamed according to sampledict
    '''
    if suffix:
        # rebinds the local name only; the caller's dict is left untouched
        sampledict = {
            old: new + '_' + kind
            for old, new in sampledict.items()
        }
    expression = pd.read_csv(roadmap_rna_bname + kind + '.pc',
                             sep='\t',
                             index_col=0,
                             usecols=sampledict.keys())
    return expression.rename(sampledict, axis=1)


def read_roadmap_rna_RPKM_N(sampledict={
    'E071': 'BRN.HIPP.MID',
    'E082': 'BRN.FET.F'
}):
    l = [read_roadmap_rna(k, sampledict, suffix=True) for k in ['RPKM', 'N']]