Example #1
0
def signup():
    """Create a user from the request arguments and report success.

    Calls ``check_args`` with the request arguments and the names 'name'
    and 'password' (presumably to validate that both are present), then
    hands both values plus the whole argument mapping to ``user.new_user``.

    Returns:
        A dict with a 'message' string and, under 'user', whatever
        ``user.new_user`` returned.
    """
    check_args(g.args, 'name', 'password')
    params = g.args
    created = user.new_user(params['name'], params['password'], params)
    return {'message': 'signup successful', 'user': created}
Example #2
0
def user_login():
    """Exchange a username and password for an authentication token.

    Stores the result of ``user.auth`` on ``g.user``. Raises
    ``ex.Unauthorized`` when that result is falsy; otherwise returns a dict
    holding a success message and ``g.user.token``.
    """
    # NOTE: no permission is required here because this endpoint uses an
    # alternative login method (username & password rather than token)
    # and sets the user object on its own.
    # CONSIDER: add a delay for password based login to prevent excessive
    # attempts.
    check_args('username', 'password')
    credentials = g.args
    g.user = user.auth(
        credentials['username'],
        credentials['password'],
        ip=request.remote_addr,
    )
    if g.user:
        return {'message': 'login successful', 'token': g.user.token}
    raise ex.Unauthorized('Bad username or password.')
def vote(update, context):
    """Handle the /vote command.

    Deletes the message that triggered the command. When ``check_args``
    returns a truthy value, builds a reply with ``gen_message.gen_vote``
    and sends it to the same chat if the reply text is non-empty.
    """
    logging.info('vote command called')
    chat_id = update.effective_chat.id
    context.bot.deleteMessage(chat_id=chat_id, message_id=update.message.message_id)
    # presumably checks the argument count against the usage template
    args_ok = check_args(update, context, "/vote", ["EVENT", "LOCATION"],
                         operator.lt, len(context.args))
    if not args_ok:
        return
    sender = update.message.from_user
    text = gen_message.gen_vote(sender.first_name, context.args)
    if not text:
        return
    context.bot.send_message(chat_id=update.effective_chat.id, text=text)
Example #4
0
def user_login():
    """Return a token used for authentication across the rest of the site.

    The authenticated user object from ``user.auth`` is stored on
    ``g.user``; a falsy result raises ``ex.Unauthorized``.
    """
    # NOTE: no permission required for this part because it uses an
    # alternative login method (username & password rather than token)
    # and declares the user object on its own
    # CONSIDER: add a delay for password based login to prevent
    # excessive attempts

    check_args('username', 'password')
    name = g.args['username']
    secret = g.args['password']
    g.user = user.auth(name, secret, ip=request.remote_addr)
    if not g.user:
        raise ex.Unauthorized('Bad username or password.')

    response = {'message': 'login successful'}
    response['token'] = g.user.token
    return response
Example #5
0
def get_patients_variants(**kwargs):
    '''
    Reshape genotype_df into plain dictionaries so that gnomad and cadd
    annotations can be attached per variant.

    keyword arguments (checked with helper.check_args):
     genotype_df: indexed by variant, one column per patient; cells equal
        to 1 are treated as het and cells equal to 2 as hom
     gnomad_freqs: mapping variant -> dict of gnomad fields

    returns
    {
        patients: {p: [v1, v2, ...]}   # a hom variant is listed twice
        variants: {v1: {
            gnomad_af:
            gnomad_an:
            gnomad_ac:
            gnomad_hom_c:
            gnomad_hom_f:
            cadd: None                 # placeholder
        }}
    }
    '''
    helper.check_args({'genotype_df', 'gnomad_freqs'}, kwargs,
                      'get_patients_variants')

    patients = defaultdict(list)
    for patient in list(kwargs['genotype_df']):
        genotypes = kwargs['genotype_df'][patient]
        carried = patients[patient]
        # het variants are recorded once
        carried.extend(genotypes[genotypes == 1].index)
        # hom variants are recorded twice
        carried.extend(list(genotypes[genotypes == 2].index) * 2)

    gnomad_fields = (
        'gnomad_af',
        'gnomad_an',
        'gnomad_ac',
        'gnomad_hom_c',
        'gnomad_hom_f',
    )
    variants = {}
    for variant in kwargs['genotype_df'].index:
        freqs = kwargs['gnomad_freqs'][variant]
        annotation = {}
        for field in gnomad_fields:
            annotation[field] = freqs[field]
        annotation['cadd'] = None
        variants[variant] = annotation

    return {'patients': patients, 'variants': variants}
def training(update, context):
    """Handle the /training command.

    Deletes the triggering message. When ``check_args`` returns a truthy
    value, builds the announcement with ``gen_message.gen_training`` and, if
    the text is non-empty, sends it with an inline "Join" button whose
    callback data is "train" followed by the sender's first name.
    """
    logging.info('training command called')
    context.bot.deleteMessage(chat_id=update.effective_chat.id,
                              message_id=update.message.message_id)
    if not check_args(update, context, "/training", ["[LOCATION]"],
                      operator.lt, len(context.args)):
        return
    sender = update.message.from_user
    text = gen_message.gen_training(sender.first_name, context.args)
    join_button = InlineKeyboardButton(
        "Join", callback_data="train" + str(sender.first_name))
    markup = InlineKeyboardMarkup([[join_button]])
    if text:
        context.bot.send_message(chat_id=update.effective_chat.id,
                                 text=text, reply_markup=markup)
def notraining(update, context):
    """Handle the /notraining command.

    Deletes the triggering message. With no arguments the reply comes from
    ``gen_message.gen_notraining_0``; otherwise, if ``check_args`` returns a
    truthy value, from ``gen_message.gen_notraining``. A non-empty reply is
    sent to the same chat.
    """
    logging.info('notraining command called')
    context.bot.deleteMessage(chat_id=update.effective_chat.id,
                              message_id=update.message.message_id)
    sender = update.message.from_user
    arg_count = len(context.args)
    if arg_count == 0:
        text = gen_message.gen_notraining_0(sender.first_name)
    elif check_args(update, context, "/notraining", ["TIME", "LOCATION"],
                    operator.lt, arg_count):
        text = gen_message.gen_notraining(sender.first_name, context.args)
    else:
        # arguments rejected: nothing to send
        text = ""
    if text:
        context.bot.send_message(chat_id=update.effective_chat.id, text=text)
def rm_vote(update, context):
    """Handle the /removevote command.

    Deletes the triggering message. With no arguments the reply comes from
    ``gen_message.gen_rm_vote_0``; otherwise, if ``check_args`` returns a
    truthy value, from ``gen_message.gen_rm_vote``. A non-empty reply is
    sent to the same chat.
    """
    context.bot.deleteMessage(chat_id=update.effective_chat.id,
                              message_id=update.message.message_id)
    logging.info('removevote command called')
    sender = update.message.from_user
    arg_count = len(context.args)
    if arg_count == 0:
        text = gen_message.gen_rm_vote_0(sender.first_name)
    elif check_args(update, context, "/removevote", ["EVENT"],
                    operator.lt, arg_count):
        text = gen_message.gen_rm_vote(sender.first_name, context.args)
    else:
        # arguments rejected: nothing to send
        text = ""
    if text:
        context.bot.send_message(chat_id=update.effective_chat.id, text=text)
def notomorrowtraining(update, context):
    """Handle the /notomorrowtraining command.

    Same flow as the plain no-training handler, but the ``gen_message``
    helpers are called with a ``day`` key built from tomorrow's date
    followed by ".train" and with a "tomorrow" ``daystring``.
    """
    logging.info('notomorrowtraining command called')
    context.bot.deleteMessage(chat_id=update.effective_chat.id,
                              message_id=update.message.message_id)
    sender = update.message.from_user
    arg_count = len(context.args)
    if arg_count == 0:
        day_key = f"{date.today() + timedelta(days=1)}.train"
        # note the leading space: this daystring differs from the one below
        text = gen_message.gen_notraining_0(sender.first_name, day=day_key,
                                            daystring=" tomorrow")
    elif check_args(update, context, "/notomorrowtraining",
                    ["TIME", "LOCATION"], operator.lt, arg_count):
        day_key = f"{date.today() + timedelta(days=1)}.train"
        text = gen_message.gen_notraining(sender.first_name, context.args,
                                          day=day_key, daystring="tomorrow ")
    else:
        # arguments rejected: nothing to send
        text = ""
    if text:
        context.bot.send_message(chat_id=update.effective_chat.id, text=text)
def tomorrowtraining(update, context):
    """Handle the /tomorrowtraining command.

    Deletes the triggering message. When ``check_args`` returns a truthy
    value, builds the announcement for tomorrow with
    ``gen_message.gen_training``. The inline "Join" button is attached
    unless the generated text is the time-format error message. Nothing is
    sent when the text is empty.
    """
    logging.info('tomorrowtraining command called')
    context.bot.deleteMessage(chat_id=update.effective_chat.id,
                              message_id=update.message.message_id)
    if not check_args(update, context, "/tomorrowtraining",
                      ["TIME", "LOCATION"], operator.lt, len(context.args)):
        return
    sender = update.message.from_user
    day_key = f"{date.today() + timedelta(days=1)}.train"
    text = gen_message.gen_training(sender.first_name, context.args,
                                    day=day_key, daystring="tomorrow ")
    if text == "Please specify time in the format HH:MM":
        # error reply: no Join button
        markup = None
    else:
        join_button = InlineKeyboardButton(
            "Join", callback_data="train" + str(sender.first_name))
        markup = InlineKeyboardMarkup([[join_button]])
    if text:
        context.bot.send_message(chat_id=update.effective_chat.id,
                                 text=text, reply_markup=markup)
Example #11
0
def main(**kwargs):
    '''
    Compute phenogenon arrays for every testable HPO term, under both the
    recessive ('r') and dominant ('d') modes.

    keyword arguments checked by helper.check_args:
     N, gnomad_path, patient_mini_file, patient_info_file, unrelated_file,
     gnomad_cutoff, gnomad_step, hpo_mask, cadd_step, cadd_min
    All keyword arguments (plus a default empty 'gene_inheritance_mode')
    are forwarded to PM.main.

    An HPO term is tested when its value from helper.get_phs is at least N
    and it is not in hpo_mask.

    returns None when PM.main returns None, otherwise a dict with keys
    'phenogenon', 'NP', 'patient_map' and 'patients_variants'
    '''
    required = {
        'N',
        'gnomad_path',
        'patient_mini_file',
        'patient_info_file',
        'unrelated_file',
        'gnomad_cutoff',
        'gnomad_step',
        'hpo_mask',
        'cadd_step',
        'cadd_min',
    }
    helper.check_args(required, kwargs, 'main')
    kwargs.setdefault('gene_inheritance_mode', {})

    patient_info = helper.get_snapshot(kwargs['patient_info_file'])
    hpo_counts = helper.get_phs(patient_info)

    mapped = PM.main(**kwargs)
    if mapped is None:
        return None

    modes = 'rd'
    # the patient map keys arrive as 'a,b' strings; turn them into int tuples
    keyed_maps = {}
    for mode in modes:
        keyed_maps[mode] = {
            tuple([int(part) for part in raw_key.split(',')]): members
            for raw_key, members in mapped['patient_map'][mode].items()
        }

    cache = {'r': {}, 'd': {}}
    testable = [
        term for term, count in hpo_counts.items()
        if count >= kwargs['N'] and term not in kwargs['hpo_mask']
    ]
    for term in testable:
        for mode in modes:
            genon = helper.phenogenon(
                hpos=term,
                mode=mode,
                patient_info=patient_info,
                patient_map=keyed_maps[mode],
            )
            cache[mode][term] = genon.tolist()

    return {
        'phenogenon': cache,
        'NP': mapped['NP'],
        'patient_map': mapped['patient_map'],
        'patients_variants': mapped['patients_variants'],
    }
Example #12
0
def main(**kwargs):
    '''
    Build the recessive ('r') and dominant ('d') patient maps for the
    variants returned by PV.main.

    keyword arguments checked by helper.check_args:
     remove_nc, vcf_file, gnomad_path, cadd_file, patient_mini_file,
     patient_info_file, human_fasta_ref, unrelated_file, v_cutoff,
     p_cutoff, gnomad_cutoff, gnomad_step, cadd_step, cadd_min, cis_gap
    All keyword arguments (plus a default empty 'gene_inheritance_mode')
    are forwarded to PV.main.

    returns None when PV.main returns None (no usable variants), otherwise
    {
        patient_map: {mode: {'i,j': [list, list]}}
        patients_variants: the PV.main result
        NP: {mode: number of distinct patients found in the map entries
                   whose second key element is 0}
    }
    '''
    # check args
    compulsory_keys = {
        'remove_nc',
        'vcf_file',
        'gnomad_path',
        'cadd_file',
        'patient_mini_file',
        'patient_info_file',
        'human_fasta_ref',
        'unrelated_file',
        'v_cutoff',
        'p_cutoff',  # not using this since no phasing is done
        'gnomad_cutoff',
        'gnomad_step',
        'cadd_step',
        'cadd_min',
        'cis_gap',
    }
    helper.check_args(compulsory_keys, kwargs, 'main')
    # defaults
    kwargs.setdefault('gene_inheritance_mode', {})
    # gnomad and cadd bin edges
    gnomad_steps = np.arange(0,
                             kwargs['gnomad_cutoff'] + kwargs['gnomad_step'],
                             kwargs['gnomad_step'])
    cadd_steps = np.arange(kwargs['cadd_min'], 60, kwargs['cadd_step'])

    # get patients_variants, with variants annotated with gnomad freqs
    # and cadd
    patients_variants = PV.main(**kwargs)
    if patients_variants is None:
        # nothing to map; without this guard the 'cover_df' lookup below
        # raises TypeError on None
        return None

    # get patient_map for recessive and dominant modes
    args = dict(
        data=patients_variants,
        vcf=patients_variants['cover_df'],
        gnomad_steps=gnomad_steps,
        cadd_steps=cadd_steps,
        cis_gap=kwargs['cis_gap'],
    )
    patient_map = {'r': {}, 'd': {}}
    modes = 'rd'

    # get number of patients who carry rare variants when get patient_maps
    NP = {}
    for mode in modes:
        args['mode'] = mode
        M = helper.get_patient_map(**args)
        NP[mode] = len(
            set(
                itertools.chain.from_iterable(
                    v[0] for k, v in M.items() if k[1] == 0)))
        # change the tuple keys to 'i,j' strings
        for k, v in M.items():
            patient_map[mode]['{},{}'.format(k[0], k[1])] = [
                list(v[0]), list(v[1])
            ]

    return {
        'patient_map': patient_map,
        'patients_variants': patients_variants,
        'NP': NP,
    }
def main(**kwargs):
    '''
    For every gene on one chromosome that has a stored patient map, compute
    phenogenon arrays per HPO term and inheritance mode, and stream the
    results to a JSON file.

    keyword arguments checked by helper.check_args:
     N, gnomad_path, patient_mini_file, patient_info_file, unrelated_file,
     gnomad_cutoff, gnomad_step, hpo_mask, cadd_step, cadd_min, output
    also read (not checked by helper.check_args):
     chrom: passed to get_chrom_genes and used to name the patient maps file
     patient_maps_path: directory holding '<chrom>.json'
    optional:
     hpos_to_test: HPO terms to test; when missing or empty, every term
        whose value from helper.get_phs is at least N is tested. Terms in
        hpo_mask are always excluded.
     gene_inheritance_mode: {gene name or id: modes}, default 'rd'

    Writes {gene_id: {symbol, phenogenon, NP}} to kwargs['output'].
    returns None; returns early (after printing 'already done') when the
    output file already exists.
    '''
    # check args
    compulsory_keys = {
        'N',
        'gnomad_path',
        'patient_mini_file',
        'patient_info_file',
        'unrelated_file',
        'gnomad_cutoff',
        'gnomad_step',
        'hpo_mask',
        'cadd_step',
        'cadd_min',
        'output',
    }
    helper.check_args(compulsory_keys, kwargs, 'main')
    # defaults
    kwargs.setdefault('gene_inheritance_mode', {})
    # output already exist?
    if os.path.isfile(kwargs['output']):
        print('already done')
        return None
    # get patient_mini and patient_info
    patient_info = helper.get_snapshot(kwargs['patient_info_file'])
    patient_mini = helper.get_snapshot(kwargs['patient_mini_file'])
    # get p_h for all hpos
    phs = helper.get_phs(patient_info)
    # add cohort info into patient_mini
    # !!!! this belongs to UCLex's problem!!! remove if publish
    # JingYu and BLACK to UKIRDC, KELSELL to DavidKelsell
    contactdict = dict(
        JingYu='UKIRDC',
        Black='UKIRDC',
        KELSELL='DavidKelsell',
        TonySegal='SEGAL',
        SanjaySisodiya='SISODIYA',
    )
    # $in needs a real list: a dict view is not BSON-encodable on Python 3
    all_p = MONGO['patient_db'].patients.find(
        {'external_id': {'$in': list(patient_mini.keys())}},
        {'external_id': 1, 'contact': 1})
    for i in all_p:
        contact = i['contact']['user_id']
        contact = contactdict.get(contact, contact)
        patient_mini[i['external_id']] = {
            'hpo': patient_mini[i['external_id']],
            'contact': contact,
        }
    # get hpodb from json
    # NOTE(review): the result is not used below; the call is kept because
    # it reads a file - confirm whether it can go
    hpo_db = get_hpo_from_json('../tests/data/new-hpo-hpo.json')
    fields = {
        'gene_id': 1,
        'gene_name': 1,
        '_id': 0,
        'chrom': 1,
        'start': 1,
        'stop': 1,
        'xstart': 1,
        'xstop': 1,
    }

    gene_ranges = get_chrom_genes(kwargs['chrom'], fields,
                                  MONGO['phenopolis_db'])
    try:
        # get gnomad and cadd steps
        # NOTE(review): neither array is used below - confirm
        gnomad_steps = np.arange(
            0,
            kwargs['gnomad_cutoff'] + kwargs['gnomad_step'],
            kwargs['gnomad_step'])
        cadd_steps = np.arange(kwargs['cadd_min'], 60, kwargs['cadd_step'])

        # get patient_maps
        patient_maps_file = os.path.join(
            kwargs['patient_maps_path'],
            '{}.json'.format(kwargs['chrom']))
        with open(patient_maps_file, 'r') as inf:
            patient_maps = json.load(inf)

        # get all testable hpos; the selection does not depend on the gene,
        # so it is done once
        if kwargs.get('hpos_to_test'):
            hpos_to_test = [
                i for i in kwargs['hpos_to_test']
                if i not in kwargs['hpo_mask']
            ]
        else:
            hpos_to_test = [
                i for i, v in phs.items()
                if v >= kwargs['N'] and i not in kwargs['hpo_mask']
            ]

        number_processed = 0
        # the context manager closes the output even if a gene fails
        with open(kwargs['output'], 'w') as outf:
            outf.write('{')
            for gene_range in gene_ranges:
                # get patient_map
                patient_map = patient_maps.pop(gene_range['gene_id'], None)
                if patient_map is None:
                    continue
                # print progress
                number_processed += 1
                if not number_processed % 100:
                    print('===processed {} genes==='.format(
                        number_processed))
                print('processing {}'.format(gene_range['gene_name']))

                # the mode may be keyed by gene name or by gene id
                modes = kwargs['gene_inheritance_mode'].get(
                    gene_range['gene_name'],
                    kwargs['gene_inheritance_mode'].get(
                        gene_range['gene_id'], 'rd'))

                # translate patient_map's 'i,j' keys into int tuples
                pm = {}
                for m in modes:
                    pm[m] = {}
                    for k, v in patient_map['patient_map'][m].items():
                        key = tuple([int(i) for i in k.split(',')])
                        pm[m][key] = v
                phenogenon_cache = {'r': {}, 'd': {}}
                for hpos in hpos_to_test:
                    for mode in modes:
                        genon = helper.phenogenon(
                            hpos=hpos,
                            mode=mode,
                            patient_info=patient_info,
                            patient_map=pm[mode],
                        )
                        phenogenon_cache[mode][hpos] = genon.tolist()

                output = json.dumps({
                    gene_range['gene_id']: {
                        'symbol': gene_range['gene_name'],
                        'phenogenon': phenogenon_cache,
                        'NP': patient_map['NP'],
                    }
                })
                # strip off the braces so records can be concatenated
                # inside the single outer object
                output = output[1:-1]
                if number_processed != 1:
                    # meaning not the first record. add a comma
                    output = ',' + output
                outf.write(output)
            outf.write('}')
    finally:
        # close cursor, also when something above raised
        gene_ranges.close()
Example #14
0
def main(**kwargs):
    '''
    For each gene, parse the vcf, filter and annotate its variants, build
    the recessive/dominant patient maps and dump everything to a JSON file.

    keyword arguments checked by helper.check_args:
     remove_nc, vcf_file, gnomad_path, cadd_file, patient_mini_file,
     patient_info_file, human_fasta_ref, unrelated_file, v_cutoff,
     p_cutoff, gnomad_cutoff, gnomad_step, cadd_step, cadd_min,
     genon_sum_cutoff_coefficient, cis_gap, output
    optional:
     genes: gene symbols, converted to ids to restrict the mongodb query
     chrom: when given, genes come from get_chrom_genes for that chromosome
     gene_inheritance_mode: {gene name or id: modes}, default 'rd'

    Writes {gene_id: {symbol, patient_map, NP}} to kwargs['output'].
    returns None
    '''
    # check args
    compulsory_keys = {
        'remove_nc',
        'vcf_file',
        'gnomad_path',
        'cadd_file',
        'patient_mini_file',
        'patient_info_file',
        'human_fasta_ref',
        'unrelated_file',
        'v_cutoff',
        'p_cutoff',  # not using this since no phasing is done
        'gnomad_cutoff',
        'gnomad_step',
        'cadd_step',
        'cadd_min',
        'genon_sum_cutoff_coefficient',
        'cis_gap',
        'output',
    }
    helper.check_args(compulsory_keys, kwargs, 'main')
    # defaults
    kwargs.setdefault('gene_inheritance_mode', {})
    # get patient_mini and patient_info
    patient_info = helper.get_snapshot(kwargs['patient_info_file'])
    patient_mini = helper.get_snapshot(kwargs['patient_mini_file'])
    # get p_h for all hpos
    # NOTE(review): phs is not used below - confirm whether it can go
    phs = helper.get_phs(patient_info)
    # add cohort info into patient_mini
    # !!!! this belongs to UCLex's problem!!! remove if publish
    # JingYu and BLACK to UKIRDC, KELSELL to DavidKelsell
    contactdict = dict(
        JingYu='UKIRDC',
        Black='UKIRDC',
        KELSELL='DavidKelsell',
        TonySegal='SEGAL',
        SanjaySisodiya='SISODIYA',
    )
    # $in needs a real list: a dict view is not BSON-encodable on Python 3
    all_p = MONGO['patient_db'].patients.find(
        {'external_id': {
            '$in': list(patient_mini.keys())
        }}, {
            'external_id': 1,
            'contact': 1
        })
    for i in all_p:
        contact = i['contact']['user_id']
        contact = contactdict.get(contact, contact)
        patient_mini[i['external_id']] = {
            'hpo': patient_mini[i['external_id']],
            'contact': contact
        }
    # get genes. if not provided, get all gene_ids from mongodb;
    # if provided, convert to gene_id
    fields = {
        'gene_id': 1,
        'gene_name': 1,
        '_id': 0,
        'chrom': 1,
        'start': 1,
        'stop': 1,
        'xstart': 1,
        'xstop': 1,
    }
    this = {}
    if kwargs.get('genes', None) is not None:
        genes = phenopolis_utils.symbols_to_ids(kwargs['genes'],
                                                MONGO['phenopolis_db'])
        this = {'gene_id': {'$in': genes}}

    # sometimes the cursor times out.
    # but do remember to close it
    if kwargs.get('chrom', None) is not None:
        gene_ranges = get_chrom_genes(kwargs['chrom'], fields,
                                      MONGO['phenopolis_db'])
    else:
        gene_ranges = MONGO['phenopolis_db'].genes.find(this,
                                                        fields,
                                                        no_cursor_timeout=True)
    result = {}
    try:
        # get gnomad and cadd steps
        gnomad_steps = np.arange(
            0, kwargs['gnomad_cutoff'] + kwargs['gnomad_step'],
            kwargs['gnomad_step'])
        cadd_steps = np.arange(kwargs['cadd_min'], 60, kwargs['cadd_step'])

        # for each gene, get all valid variants/patients according to
        # p/v_cutoff, annotate using gnomad
        number_processed = 0
        last_chrom = None
        coding_variants = None
        for gene_range in gene_ranges:
            # print progress
            number_processed += 1
            if not number_processed % 100:
                print('===processed {} genes==='.format(number_processed))
            print('processing {}'.format(gene_range['gene_name']))
            # first parse vcf file to get genotype and coverage for each
            # variant
            vcf_file = kwargs['vcf_file'].format(gene_range['chrom'])
            args = dict(
                vcf_file=vcf_file,
                chrom=gene_range['chrom'],
                start=gene_range['start'],
                stop=gene_range['stop'],
                unrelated_file=kwargs['unrelated_file'],
                human_fasta_ref=kwargs['human_fasta_ref'],
                v_cutoff=kwargs['v_cutoff'],
                p_cutoff=kwargs['p_cutoff'],
                gnomad_path=kwargs['gnomad_path'],
                gnomad_cutoff=kwargs['gnomad_cutoff'],
                patient_mini=patient_mini,
            )
            vcf_dfs = get_vcf_df(**args)
            if vcf_dfs is None:
                # no variants, continue
                continue

            # reload coding variants only when the chromosome changes
            # NOTE(review): the path below is hard-coded to one cluster
            if kwargs['remove_nc'] and gene_range['chrom'] != last_chrom:
                coding_variant_file = '/cluster/project8/vyp/JingYu/git/phenopolis_analysis/data/public/vcf/chr{}.coding.tsv'.format(
                    gene_range['chrom'])
                coding_variants = get_coding_variants(coding_variant_file)
            last_chrom = gene_range['chrom']
            genotype_df, cover_df, gnomad_freqs = vcf_dfs
            # then get patients_variants, with variants annotated with
            #  gnomad freqs and cadd
            args = dict(
                gnomad_freqs=gnomad_freqs,
                genotype_df=genotype_df,
            )
            patients_variants = get_patients_variants(**args)
            # remove noncoding?
            if kwargs['remove_nc']:
                # variant counts before and after the removal
                print(len(patients_variants['variants']))
                helper.remove_noncoding(gene_range['gene_id'],
                                        patients_variants, coding_variants)
                print(len(patients_variants['variants']))
            # if no variants left, skip
            if not patients_variants['variants']:
                continue
            # for each gene, remove batch-specific variants
            args = dict(
                data=patients_variants,
                patient_mini=patient_mini,
            )
            batch_specific_variants = helper.get_batch_artefacts(**args)
            patients_variants = helper.remove_batch_artefacts(
                patients_variants,
                batch_specific_variants,
                patient_mini,
            )
            # add cadd
            args = dict(
                variants=patients_variants['variants'],
                chrom=gene_range['chrom'],
                start=gene_range['start'],
                stop=gene_range['stop'],
                cadd_file=kwargs['cadd_file'],
            )
            cadds = helper.add_cadd(**args)
            for k, v in cadds.items():
                patients_variants['variants'][k]['cadd'] = v

            # when two variants are in cis and both appear in one patient,
            # discard the variant with lower cadd in that patient
            # example SDK1 ('7-3990565-C-T','7-4014039-C-T')
            # for now, only focus on variants with hom_f < 0.00025
            remove_cis(patients_variants, genotype_df)

            # get patient_map for recessive and dominant modes
            args = dict(
                data=patients_variants,
                vcf=cover_df,
                gnomad_steps=gnomad_steps,
                cadd_steps=cadd_steps,
                cis_gap=kwargs['cis_gap'],
            )
            patient_map = {'r': {}, 'd': {}}
            # first get mode if provided. Note that the keys could be id
            #  or gene name
            modes = kwargs['gene_inheritance_mode'].get(
                gene_range['gene_name'],
                kwargs['gene_inheritance_mode'].get(gene_range['gene_id'],
                                                    'rd'))

            # get number of patients who carry rare variants when get
            # patient_maps
            NP = {}
            for mode in modes:
                args['mode'] = mode
                M = helper.get_patient_map(**args)
                NP[mode] = len(
                    set(
                        itertools.chain.from_iterable(
                            v[0] for k, v in M.items() if k[1] == 0)))
                # change the tuple keys to 'i,j' strings
                for k, v in M.items():
                    patient_map[mode]['{},{}'.format(
                        k[0], k[1])] = [list(v[0]), list(v[1])]

            result[gene_range['gene_id']] = {
                'symbol': gene_range['gene_name'],
                'patient_map': patient_map,
                'NP': NP,
            }
    finally:
        # close cursor, also when a gene fails half way
        gene_ranges.close()

    # write everything to output; text mode, because json.dump writes str
    with open(kwargs['output'], 'w') as outf:
        json.dump(result, outf)
Example #15
0
def get_vcf_df(**kwargs):
    '''
    use tabix and bcftools to subset variants and patients, then use
    p/v_cutoff and gnomad annotation to get bad_vs and bad_ps to remove

    keyword arguments checked by helper.check_args:
     vcf_file, chrom, start, stop, unrelated_file, human_fasta_ref,
     v_cutoff, gnomad_cutoff, p_cutoff, patient_mini
    also read (not checked by helper.check_args):
     gnomad_path

    returns None when the subset vcf is empty, otherwise the tuple
    (genotype_df, cover_df, gnomad_freqs). bad variants and patients are
    dropped from genotype_df only; cover_df keeps every row and column.
    '''
    compulsory_keys = {
        'vcf_file',
        'chrom',
        'start',
        'stop',
        'unrelated_file',
        'human_fasta_ref',
        'v_cutoff',
        'gnomad_cutoff',
        'p_cutoff',
        'patient_mini',
    }
    # check args
    helper.check_args(compulsory_keys, kwargs, 'get_vcf_df')
    position = '{chrom}:{start}-{stop}'.format(**kwargs)
    ps1 = subprocess.Popen(('tabix', '-h', kwargs['vcf_file'], position),
                           stdout=subprocess.PIPE)
    # subset on unrelated samples, and normalise
    ps2 = subprocess.Popen(('bcftools', 'view', '-Ou', '-S',
                            kwargs['unrelated_file'], '-f', 'PASS'),
                           stdin=ps1.stdout,
                           stdout=subprocess.PIPE)
    # drop our copy of each pipe once the next stage holds it, so an
    # upstream stage sees a broken pipe if its consumer exits early
    ps1.stdout.close()
    ps3 = subprocess.Popen(('bcftools', 'norm', '-Ou', '-m', '-any'),
                           stdin=ps2.stdout,
                           stdout=subprocess.PIPE)
    ps2.stdout.close()
    try:
        normed_vcf = subprocess.check_output(
            ['bcftools', 'norm', '-Ov', '-f', kwargs['human_fasta_ref']],
            stdin=ps3.stdout)
    finally:
        ps3.stdout.close()
        # reap the pipeline stages so they do not linger as zombies
        for stage in (ps1, ps2, ps3):
            stage.wait()
    # get vcf df. genotype -1 = missing, 0 = wildtype, 1 = het, 2 = hom
    genotype_df = read_vcf(normed_vcf)
    # empty vcf? early return
    if genotype_df.empty:
        return None
    # get poorly covered variants and individuals
    # cover_df: 1 where a genotype was called, 0 where it is missing
    cover_df = genotype_df.copy()
    cover_df[cover_df >= 0] = 1
    cover_df[cover_df == -1] = 0
    pm = cover_df.mean()

    # patients with low coverage, plus patients not in patient_mini
    bad_ps = set(pm[pm < kwargs['p_cutoff']].index)
    bad_ps.update(set(pm.index) - set(kwargs['patient_mini'].keys()))
    vm = cover_df.T.mean()
    bad_vs = set(vm[vm < kwargs['v_cutoff']].index)
    # annotate the remaining variants with gnomad
    vs = (i for i in vm.index if i not in bad_vs)
    gnomad_freqs = gnomad_utils.overall_freqs(vs, kwargs['gnomad_path'])

    # remove variants with 'SEGDUP' filter. This gives a lot of noise for
    # recessive analysis.
    # For example IGHV3-38 - ENST00000390618, 14-106866588-T-C
    bad_vs.update([
        i for i, v in gnomad_freqs.items()
        if (v['filters']['exome'] is not None
            and 'SEGDUP' in v['filters']['exome'])
        or (v['filters']['genome'] is not None
            and 'SEGDUP' in v['filters']['genome'])
    ])
    # in fact, many variants have very high af, but 0 hom_f, such as
    # 6-32548641-A-T, which has no 'SEGDUP' filter. Remove those
    # hard filtering for the time being. There might be better ways
    # gnomad_af can be None (handled below); comparing None with a float
    # raises TypeError on Python 3, so guard it here
    bad_vs.update([
        i for i, v in gnomad_freqs.items()
        if v['gnomad_af'] is not None and v['gnomad_af'] > 0.01
        and v['gnomad_hom_f'] == 0.0
    ])

    # add to bad_vs gnomad_hom_af >= gnomad_cutoff,
    #  and those not covered by gnomad_path
    # Note that if gnomad_hom_af >= gnomad_cutoff, then
    #  gnomad_af >= gnomad_cutoff, but not vice versa
    bad_vs.update([
        i for i, v in gnomad_freqs.items() if v['gnomad_af'] is None
        or v['gnomad_hom_f'] >= kwargs['gnomad_cutoff']
    ])
    # per-variant sum of the positive genotype values
    vs_count = np.sum(genotype_df[genotype_df > 0], axis=1)
    bad_vs.update([
        i for i in gnomad_freqs
        if vs_count[i] > 3 and gnomad_freqs[i]['pop_filter']
    ])
    # then drop bad_ps and bad_vs
    genotype_df.drop(bad_vs, inplace=True)
    genotype_df.drop(bad_ps, inplace=True, axis=1)
    return (genotype_df, cover_df, gnomad_freqs)
Example #16
0
def main(**kwargs):
    '''
    Collect the filtered, annotated variants carried by each patient within
    one genomic range.

    keyword arguments checked by helper.check_args:
     remove_nc, vcf_file, gnomad_path, cadd_file, patient_mini_file,
     patient_info_file, human_fasta_ref, unrelated_file, v_cutoff,
     p_cutoff, gnomad_cutoff, gnomad_step, cadd_step, cadd_min, cis_gap
    also read (not checked by helper.check_args):
     range: 'chrom:start-stop'

    returns None when the vcf subset is empty or no variants are left
    after the optional noncoding removal; otherwise the patients_variants
    dict with cadd scores filled in and the coverage frame under 'cover_df'
    '''
    required = {
        'remove_nc',
        'vcf_file',
        'gnomad_path',
        'cadd_file',
        'patient_mini_file',
        'patient_info_file',
        'human_fasta_ref',
        'unrelated_file',
        'v_cutoff',
        'p_cutoff',  # not using this since no phasing is done
        'gnomad_cutoff',
        'gnomad_step',
        'cadd_step',
        'cadd_min',
        'cis_gap',
    }
    helper.check_args(required, kwargs, 'main')
    kwargs.setdefault('gene_inheritance_mode', {})

    # snapshots and per-hpo values (phs is not used in this function)
    patient_info = helper.get_snapshot(kwargs['patient_info_file'])
    patient_mini = helper.get_snapshot(kwargs['patient_mini_file'])
    phs = helper.get_phs(patient_info)

    # projection and query placeholders (not used in this function)
    fields = dict(
        gene_id=1,
        gene_name=1,
        _id=0,
        chrom=1,
        start=1,
        stop=1,
        xstart=1,
        xstop=1,
    )
    this = {}

    # gnomad and cadd bin edges (not used in this function)
    gnomad_steps = np.arange(0,
                             kwargs['gnomad_cutoff'] + kwargs['gnomad_step'],
                             kwargs['gnomad_step'])
    cadd_steps = np.arange(kwargs['cadd_min'], 60, kwargs['cadd_step'])

    # split 'chrom:start-stop'; start and stop stay strings
    chrom, crange = kwargs['range'].split(':')
    start, stop = crange.split('-')

    # parse the vcf file to get genotype and coverage for each variant
    vcf_file = kwargs['vcf_file'].format(chrom)
    vcf_dfs = get_vcf_df(
        vcf_file=vcf_file,
        chrom=chrom,
        start=start,
        stop=stop,
        unrelated_file=kwargs['unrelated_file'],
        human_fasta_ref=kwargs['human_fasta_ref'],
        v_cutoff=kwargs['v_cutoff'],
        p_cutoff=kwargs['p_cutoff'],
        gnomad_path=kwargs['gnomad_path'],
        gnomad_cutoff=kwargs['gnomad_cutoff'],
        patient_mini=patient_mini,
    )
    if vcf_dfs is None:
        # no variants in this range
        return None
    genotype_df, cover_df, gnomad_freqs = vcf_dfs

    # variants per patient, annotated with gnomad freqs
    patients_variants = get_patients_variants(
        gnomad_freqs=gnomad_freqs,
        genotype_df=genotype_df,
    )
    if kwargs['remove_nc']:
        helper.remove_noncoding(patients_variants, kwargs)
    if not patients_variants['variants']:
        # nothing left after the optional noncoding removal
        return None

    # remove batch-specific variants
    batch_specific_variants = helper.get_batch_artefacts(
        data=patients_variants,
        patient_mini=patient_mini,
    )
    patients_variants = helper.remove_batch_artefacts(
        patients_variants,
        batch_specific_variants,
        patient_mini,
    )

    # fill in cadd scores
    cadds = helper.add_cadd(
        variants=patients_variants['variants'],
        chrom=chrom,
        start=start,
        stop=stop,
        cadd_file=kwargs['cadd_file'],
    )
    for variant, score in cadds.items():
        patients_variants['variants'][variant]['cadd'] = score

    # when two variants are in cis and both appear in one patient,
    # discard the variant with lower cadd in that patient
    # example SDK1 ('7-3990565-C-T','7-4014039-C-T')
    # for now, only focus on variants with hom_f < 0.00025
    remove_cis(patients_variants, genotype_df)
    patients_variants['cover_df'] = cover_df

    return patients_variants
Example #17
0
def signup():
    """Register a new user from the request arguments.

    ``check_args`` is called first with the request arguments and the names
    'name' and 'password'. The user object returned by ``user.new_user`` is
    included in the response under 'user'.
    """
    check_args(g.args, 'name', 'password')
    response = {'message': 'signup successful'}
    response['user'] = user.new_user(
        g.args['name'], g.args['password'], g.args)
    return response