def get_flanking_positions(obj_tt, genomic_range, flank_type):
    """
    Return the genomic position to work with for a genomic alteration, either the
    alteration's own range or that range widened by one position on each side.

    Intended use (from the original notes):
    -SNVs & events that do not alter the reading frame -> stay on the position (flank_type = 0)
    -frameshifting events (insertions / deletions) -> retrieve the position before & after the alteration
    Args:
        -obj_tt = not used inside this function
        -genomic_range = genomic range of the genomic alteration; handed unchanged to Isoform.split_genome_pos()
        -flank_type = integer that selects which position is returned
            -0 = stay on the genomic position of the genomic alteration
            -any other value (callers are documented to use 1) = position before (start - 1) & after (end + 1)
    Returns:
        -if flank_type is 0: the hash produced by Isoform.split_genome_pos(), returned as-is
        -otherwise: a new hash with the keys 'chrom', 'start' & 'end', where 'start' is
         decreased by 1 and 'end' is increased by 1
    """
    parsed_pos = Isoform.split_genome_pos(genomic_range)

    #no flanking requested -> hand back the parsed position untouched
    if flank_type == 0:
        return parsed_pos

    #widen the range by one position in both directions
    flanked_pos = {'chrom': parsed_pos['chrom']}
    flanked_pos['start'] = parsed_pos['start'] - 1
    flanked_pos['end'] = parsed_pos['end'] + 1
    return flanked_pos
def create_obj_tt(isoform_id, genome_pos):
    """
    Build a TranslateTranscript instance for one isoform at a genomic position.

    Steps: parse 'genome_pos', create an IsoformSJ for the isoform, ask it for its
    transcript via create_canon_transcript(), then wrap both in TranslateTranscript.
    Args:
        -isoform_id = string that is the isoform, usually in the form of an Ensembl ID (e.g. ENST000..)
        -genome_pos = genomic position of interest; parsed with Isoform.split_genome_pos(),
         of which only the 'chrom' & 'start' entries are used here
    Returns:
        the TranslateTranscript instance
    """
    ensembl_db_type = 2  #this means the database is Ensembl

    parsed_pos = Isoform.split_genome_pos(genome_pos)
    #chromosome gets the 'chr' prefix, position of interest is the start of the range
    pos_of_interest = {
        'chrom': 'chr' + str(parsed_pos['chrom']),
        'pos_oi': parsed_pos['start']
    }

    #NOTE(review): the meaning of the remaining positional arguments ([], -10, False, 0, True)
    #is defined by IsoformSJ, which is not visible here - confirm against that class
    isoform_sj = IsoformSJ(ensembl_db_type, isoform_id, [], -10, pos_of_interest, False, 0, True)
    transcript = isoform_sj.create_canon_transcript(False, False)

    return TranslateTranscript(transcript, isoform_sj, DIR_GENOME, {})
Exemple #3
0
        print "Body: at row ", i, " - taking a ", sec_break, " second break (Apparently Ensembl REST API only allows 15 requests per second)"
        time.sleep(sec_break)


# Script start-up for 180825_ProcessGenomeAlt_V2.py: print a banner, start a timer,
# create the genome database handle and read the command-line parameters.
# NOTE: this is Python 2 code (print statement).
print "------------ Algorithm: 180825_ProcessGenomeAlt_V2.py ------------"
"""
Algorithm: Determine the consequence of a genomic alteration & records it in file. This also retrieves the nucleotides before & after the position of the genomic alteration

PROTOCOL: open file that contains genomic alterations -> calculate the consequence of each mutation by performing a VEP request -> record the results of the request (position, reading frame, relative CDS position, )
#LATER: retrieve the AA sequence before & after the genomic alteration
"""

# timestamp taken at script start; presumably used later to report elapsed time (not visible here)
start_time = time.time()

# Genome object for the SQLite URL below, handed to Isoform through set_cruzdb()
g = Genome('sqlite:////tmp/hg19_v2.db')
Isoform.set_cruzdb(g)
"""
mode_calc_exp -> will be used to determine if the gene expression should be calculated or not, where 0 = do not calculate gene expression, 1 = calculate gene expression.
"""
mode_calc_exp = 0  #use this if I do not want to consider thresholding by gene expression percentile. Also, this may include all occurrences of mutations, including non-coding?
# mode_calc_exp = 1     #use this to consider thresholding by gene expression percentile

#retrieve user-inputted parameters (5 positional command-line arguments are required; a missing one raises IndexError)
arg_date_output = sys.argv[1]  # presumably a date label for the output - TODO confirm against its use
thres_express = int(
    sys.argv[2])  #threshold for the gene expression percentile to accept (must parse as an integer)
path_velip_file = sys.argv[3]  # path to the input file - NOTE(review): exact file format not visible here
output_dir = sys.argv[4]
is_seq_WGS = sys.argv[5]  # kept as the raw command-line string (no int/bool conversion is done here)

##MAY DELETE, I DO NOT NEED THIS BECAUSE OF "path_velip_file"
# from SVSv7 import Isoform, IsoformSJ, TranscribeTranscript, TranslateTranscript, EnsemblVEP
from SVSv7 import Isoform, SimpleNeoepitopeAllV2, SimpleNeoepitopeIsoformV2
from mokhaPy import mokhaPy

#Constants - directories
# All paths are absolute and hard-coded to one user's machine; each is built by
# string concatenation from DIR_PROJ.
DIR_PROJ = "/home/mokha/Documents/Krauthammer_Lab"
DIR_CURR = DIR_PROJ + "/PythonClasses/SVSv7"
DIR_DATA = DIR_CURR + "/TestData"
DIR_RESULTS = DIR_CURR + "/TestResults"

# NOTE: despite the DIR_ prefix this is the path of a FASTA file (hg19.fa), not a directory
DIR_GENOME = DIR_PROJ + '/ArchiveData/hg19.fa'      #path to the samtools-indexed genome (per original note)

# banner for this test script (Python 2 print statement)
print "------------ TDD: 171108_SimpleNeoepV2_NMD.py ------------"

# Genome object for the SQLite URL below, handed to Isoform through set_cruzdb()
g = Genome( 'sqlite:////tmp/hg19_v2.db' )
Isoform.set_cruzdb( g )

##IMPORTANT TEST - THE "X" AMINO ACID
# #simulate point mutation - the genomic range for this "X" amino acid is "X:153146127-153146128"
# isoform_id = "ENST00000452593"
# # genomic_range = "X:153146127-153146128"     #this is the range of the "X" amino acid
# genomic_range = "X:153146127-153146127"
# orig = None
# alt = "T"

##MUTATIONS - minus gene
# #simulate point mutation - RESULT: codon =  aAt/at & amino acids =  N/X
# isoform_id = "ENST00000376887"
# # genomic_range = "13:95815411-95815411"
# genomic_range = "13:95953564-95953564"
# orig = None