Beispiel #1
0
def _collectlocations(rnalist):
    """Return a list of RNA locations.
    ----------Function Input----------
    <rnalist> is a list of SmallFeature objects that represent RNAs
    
    ----------Function Output----------
    This function puts all of the RNA locations into a list and returns that
    list. Format of location list:
        [[chrm1],[chrm2],...,[chrm23,[chrm24]]
    Where a given chromosome consists of [int,int],[int,int],...,[int,int],
    that is, a bunch of integer intervals representing the locations of the
    RNAs on that chromosome.
    Note: chromosome X = chromosome 23 {index 22 of location list};
    chromosome Y = chromosome 24 {index 23 of location list}.
    Only autosome will be used later so X and Y are actually irrelevant."""
    for chromo in rnalist:
        GTFparser_general.giveranger(chromo)
    locations=[[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],
        [],[],[]]
    #Careful: index 0 corresponds to chromosome 1
    chrcounter=0
    while chrcounter<22:
        for RNA in rnalist[chrcounter]:
            locations[chrcounter].append(RNA.ranger)
        chrcounter+=1
    return locations
Beispiel #2
0
def _collectlocations(rnalist):
    """Return a list of RNA locations.
    ----------Function Input----------
    <rnalist> is a list of SmallFeature objects that represent RNAs
    
    ----------Function Output----------
    This function puts all of the RNA locations into a list and returns that
    list. Format of location list:
        [[chrm1],[chrm2],...,[chrm23,[chrm24]]
    Where a given chromosome consists of [int,int],[int,int],...,[int,int],
    that is, a bunch of integer intervals representing the locations of the
    RNAs on that chromosome.
    Note: chromosome X = chromosome 23 {index 22 of location list};
    chromosome Y = chromosome 24 {index 23 of location list}.
    Only autosome will be used later so X and Y are actually irrelevant."""
    for chromo in rnalist:
        GTFparser_general.giveranger(chromo)
    locations = [[], [], [], [], [], [], [], [], [], [], [], [], [], [], [],
                 [], [], [], [], [], [], [], [], []]
    #Careful: index 0 corresponds to chromosome 1
    chrcounter = 0
    while chrcounter < 22:
        for RNA in rnalist[chrcounter]:
            locations[chrcounter].append(RNA.ranger)
        chrcounter += 1
    return locations
Beispiel #3
0
def notintergenic(pathtoconfig, filename):
    #Tested in "SECOND TEST" as described in "10-15-13 testing categorizeSNPs
    #with fake files.docx"
    """Create a pickled Python list that describes 'notintergenic regions'

    ----------Function Input----------
    <pathtoconfig> path of a text configuration file. Lines whose first
        character is "#" are ignored. Every other line is split on single
        spaces; the first token selects the entry and the second token is the
        path. The four entries "mRNApath:", "gencodepath:", "hang1path:" and
        "pseudpath:" must all be present.
    <filename> path of the pickle file to write. It is opened exactly as
        given; it is NOT re-rooted in the configuration file's directory.

    ----------Function Output----------
    Creates a nested list of locations in the same format as the output of
    _collectlocations() (24 per-chromosome lists, index 0 = chromosome 1) for
    regions that are going to be excluded from "intergenic" classification.
    The nested list is saved with pickle protocol 2 in <filename>. Lines of
    the Hangauer file that cannot be turned into a feature are written to
    "h1fails_from_notintergenic_in_categorizeSNPs.txt" in the directory of
    the configuration file. The choice of what to include in
    "notintergenic" is based on Hangauer paper. Within each chromosome the
    locations are passed through _clean_up_locations() so that the minimum
    number of intervals is reported (to help speed up the process of
    determining whether a SNP falls in a 'notintergenic' region.)
    Regions included as "notintergenic":
        RefSeq NR genes (RNA) and XR genes (RNA predicted model)
        Gencode: everything in version 18
        Hangauer's extended protein coding gene structures derived from RNA-seq
        data (downloaded as S1)
        Pseudogenes from Yale
    Note: This is based on what Hangauer et al did but with a few differences,
    including: The "mappability" of the genome is not considered here
    (Hangauer only considered mappable regions); H-Invitational data is not
    used; alternative and extended 5' and 3' UTRs from UTRdb are not used;
    everything in Gencode was used instead of just "genes"
    Note: "the default human gene set in the Ensembl browser is therefore also
    the current version of GENCODE." Which is why it's weird to me that
    Hangauer mentions Gencode and Ensembl separately...?

    ----------Errors----------
    AssertionError if any of the four paths is missing from the configuration
    file, or if the Hangauer path does not contain "19" (hg19 check).
    """
    # Imported locally because the module's import block is not visible here.
    import os

    # Collect the four input paths; the keys are the literal first tokens
    # expected in the configuration file.
    paths = {"mRNApath:": "", "gencodepath:": "", "hang1path:": "",
             "pseudpath:": ""}
    with open(pathtoconfig, 'r') as confFile:
        for line in confFile:
            if line[0] == "#":
                continue
            lineaslist = line.rsplit(" ")
            if lineaslist[0] in paths:
                paths[lineaslist[0]] = stripnewlineEnd(lineaslist[1])
    mRNApath = paths["mRNApath:"]
    gencodepath = paths["gencodepath:"]
    hang1path = paths["hang1path:"]
    pseudpath = paths["pseudpath:"]

    # Raise explicitly (instead of "assert False") so the validation is not
    # stripped when Python runs with -O; the exception type is unchanged.
    if mRNApath == "" or gencodepath == "" or hang1path == "" or pseudpath == "":
        raise AssertionError("There was an error"
            + " parsing the configuration file: \n\tmRNA path was "
            + repr(mRNApath) + "\n\tgencodepath was " + repr(gencodepath)
            + "\n\thang1path was " + repr(hang1path)
            + "\n\tpseudpath was " + repr(pseudpath))

    #Obtain all RefSeq NR and XR genes:
    refseqz = RefGene_parserII.returnRefGenelist(mRNApath, ["NR", "XR"])
    #Take everything from Gencode
    gencodez = GTFparser_general.makeSmallfeatures_fromGTF(gencodepath, True,
                                                           "ALL")
    # Presumably sets .ranger on the Gencode features -- verify in
    # GTFparser_general.
    GTFparser_general.giveranger(gencodez)

    #Use Hangauer's extended protein coding gene structures
    if "19" not in hang1path:
        raise AssertionError("Error: file name did not contain 19--"
                    + "use the Hangauer S1 file that's been converted to hg19!")

    # os.path handles a configuration path without any directory part: the
    # fails file then lands in the current directory rather than in "/".
    h1failspath = os.path.join(
        os.path.dirname(pathtoconfig),
        "h1fails_from_notintergenic_in_categorizeSNPs.txt")
    h1list = []
    with open(hang1path, 'r') as h1file:
        with open(h1failspath, 'w') as h1fails:
            for line in h1file:
                try:
                    lineaslist = line.rsplit("\t")
                    chrom = int(lineaslist[0][3:]) #chop off the chr prefix
                    startpoz = int(lineaslist[1])
                    stoppos = int(lineaslist[2])
                    h1list.append(FeatureClass_Small.SmallFeature(
                        "", ".", chrom, [startpoz], [stoppos], [1],
                        [startpoz, stoppos]))
                except Exception:
                    #for example, if the chromosome was 16_random, casting to
                    #an int would fail, so the line would not contribute to
                    #the "nonintergenic" regions and gets written to the
                    #h1fails file. Deliberately best-effort, but no longer
                    #swallows KeyboardInterrupt/SystemExit.
                    h1fails.write(line)

    #Get Yale pseudogenes
    pseuds = PseudogeneParser.return_pseudogenes(pseudpath)

    #Extract the locations from all, and make unified location list by chrom
    allfeats = refseqz + gencodez + h1list + pseuds #TO DO add in Ensembl
    locilist = [[] for _ in range(24)]
    for feat in allfeats:
        # chromosome numbers are 1-based, list indexes 0-based
        locilist[(feat.chromosome) - 1].append(feat.ranger)

    #Now make simplest possible list of positions for each chromosome that
    #covers all the necessary bases
    for chrcounter, chromo in enumerate(locilist, 1):
        print("cleaning up the locations for chromosome " + repr(chrcounter))
        # Return value is ignored, so this presumably edits the list in
        # place -- verify in _clean_up_locations.
        _clean_up_locations(chromo)

    print("pickling the cleaned locations list")
    with open(filename, 'wb') as picklefile:
        cPickle.dump(locilist, picklefile, 2)
    print("All Done :D")
Beispiel #4
0
def notintergenic(pathtoconfig, filename):
    #Tested in "SECOND TEST" as described in "10-15-13 testing categorizeSNPs
    #with fake files.docx"
    """Create a pickled Python list that describes 'notintergenic regions'

    ----------Function Input----------
    <pathtoconfig> path of a text configuration file. Lines whose first
        character is "#" are ignored. Every other line is split on single
        spaces; the first token selects the entry and the second token is the
        path. The four entries "mRNApath:", "gencodepath:", "hang1path:" and
        "pseudpath:" must all be present.
    <filename> path of the pickle file to write. It is opened exactly as
        given; it is NOT re-rooted in the configuration file's directory.

    ----------Function Output----------
    Creates a nested list of locations in the same format as the output of
    _collectlocations() (24 per-chromosome lists, index 0 = chromosome 1) for
    regions that are going to be excluded from "intergenic" classification.
    The nested list is saved with pickle protocol 2 in <filename>. Lines of
    the Hangauer file that cannot be turned into a feature are written to
    "h1fails_from_notintergenic_in_categorizeSNPs.txt" in the directory of
    the configuration file. The choice of what to include in
    "notintergenic" is based on Hangauer paper. Within each chromosome the
    locations are passed through _clean_up_locations() so that the minimum
    number of intervals is reported (to help speed up the process of
    determining whether a SNP falls in a 'notintergenic' region.)
    Regions included as "notintergenic":
        RefSeq NR genes (RNA) and XR genes (RNA predicted model)
        Gencode: everything in version 18
        Hangauer's extended protein coding gene structures derived from RNA-seq
        data (downloaded as S1)
        Pseudogenes from Yale
    Note: This is based on what Hangauer et al did but with a few differences,
    including: The "mappability" of the genome is not considered here
    (Hangauer only considered mappable regions); H-Invitational data is not
    used; alternative and extended 5' and 3' UTRs from UTRdb are not used;
    everything in Gencode was used instead of just "genes"
    Note: "the default human gene set in the Ensembl browser is therefore also
    the current version of GENCODE." Which is why it's weird to me that
    Hangauer mentions Gencode and Ensembl separately...?

    ----------Errors----------
    AssertionError if any of the four paths is missing from the configuration
    file, or if the Hangauer path does not contain "19" (hg19 check).
    """
    # Imported locally because the module's import block is not visible here.
    import os

    # Collect the four input paths; the keys are the literal first tokens
    # expected in the configuration file.
    paths = {"mRNApath:": "", "gencodepath:": "", "hang1path:": "",
             "pseudpath:": ""}
    with open(pathtoconfig, 'r') as confFile:
        for line in confFile:
            if line[0] == "#":
                continue
            lineaslist = line.rsplit(" ")
            if lineaslist[0] in paths:
                paths[lineaslist[0]] = stripnewlineEnd(lineaslist[1])
    mRNApath = paths["mRNApath:"]
    gencodepath = paths["gencodepath:"]
    hang1path = paths["hang1path:"]
    pseudpath = paths["pseudpath:"]

    # Raise explicitly (instead of "assert False") so the validation is not
    # stripped when Python runs with -O; the exception type is unchanged.
    if mRNApath == "" or gencodepath == "" or hang1path == "" or pseudpath == "":
        raise AssertionError("There was an error" +
                             " parsing the configuration file: \n\tmRNA path was " +
                             repr(mRNApath) + "\n\tgencodepath was " +
                             repr(gencodepath) + "\n\thang1path was " +
                             repr(hang1path) + "\n\tpseudpath was " +
                             repr(pseudpath))

    #Obtain all RefSeq NR and XR genes:
    refseqz = RefGene_parserII.returnRefGenelist(mRNApath, ["NR", "XR"])
    #Take everything from Gencode
    gencodez = GTFparser_general.makeSmallfeatures_fromGTF(
        gencodepath, True, "ALL")
    # Presumably sets .ranger on the Gencode features -- verify in
    # GTFparser_general.
    GTFparser_general.giveranger(gencodez)

    #Use Hangauer's extended protein coding gene structures
    if "19" not in hang1path:
        raise AssertionError(
            "Error: file name did not contain 19--" +
            "use the Hangauer S1 file that's been converted to hg19!")

    # os.path handles a configuration path without any directory part: the
    # fails file then lands in the current directory rather than in "/".
    h1failspath = os.path.join(
        os.path.dirname(pathtoconfig),
        "h1fails_from_notintergenic_in_categorizeSNPs.txt")
    h1list = []
    with open(hang1path, 'r') as h1file:
        with open(h1failspath, 'w') as h1fails:
            for line in h1file:
                try:
                    lineaslist = line.rsplit("\t")
                    chrom = int(lineaslist[0][3:])  #chop off the chr prefix
                    startpoz = int(lineaslist[1])
                    stoppos = int(lineaslist[2])
                    h1list.append(
                        FeatureClass_Small.SmallFeature(
                            "", ".", chrom, [startpoz], [stoppos], [1],
                            [startpoz, stoppos]))
                except Exception:
                    #for example, if the chromosome was 16_random, casting to
                    #an int would fail, so the line would not contribute to
                    #the "nonintergenic" regions and gets written to the
                    #h1fails file. Deliberately best-effort, but no longer
                    #swallows KeyboardInterrupt/SystemExit.
                    h1fails.write(line)

    #Get Yale pseudogenes
    pseuds = PseudogeneParser.return_pseudogenes(pseudpath)

    #Extract the locations from all, and make unified location list by chrom
    allfeats = refseqz + gencodez + h1list + pseuds  #TO DO add in Ensembl
    locilist = [[] for _ in range(24)]
    for feat in allfeats:
        # chromosome numbers are 1-based, list indexes 0-based
        locilist[(feat.chromosome) - 1].append(feat.ranger)

    #Now make simplest possible list of positions for each chromosome that
    #covers all the necessary bases
    for chrcounter, chromo in enumerate(locilist, 1):
        print("cleaning up the locations for chromosome " + repr(chrcounter))
        # Return value is ignored, so this presumably edits the list in
        # place -- verify in _clean_up_locations.
        _clean_up_locations(chromo)

    print("pickling the cleaned locations list")
    with open(filename, 'wb') as picklefile:
        cPickle.dump(locilist, picklefile, 2)
    print("All Done :D")