Beispiel #1
0
def _loadchrarray(chrmdex1,bases1,lincpath1,mrnapath1,nonintpath1,configpath1,
                  whether_to_use_real_sizes1):
    """Return the chrarray called "chromosome<chrmdex1+1>_ext<bases1>_array.bin"
    (for example, "chromosome1_ext0_array.bin").
    The chrarray should be located in the same directory as the configuration file.
    If the chrarray file already exists, it is loaded and returned.
    If the chrarray file does not exist, it is created using
    chromoarray.create_22chrom_arrays() for the
    correct chrmdex, and saved in the same directory as the configuration file.
    The creation of the chrarray file requires the nonintergenic regions to be
    defined. If they have been defined, they are loaded and used. If they have
    not been defined, then they are defined before being used."""
    confFilepathaslist=configpath1.rsplit("/") #should separator ever be "\\"?)
    confFiledirpath="/".join(confFilepathaslist[:-1])
    try:
        chrarray=cPickle.load(open(confFiledirpath+"/chromosome"
                +`chrmdex1+1`+"_ext"+`bases1`+"_array.bin",'rb'))
                #load the pre-created array for the appropriate chromosome
                #and bases1 parameter
    except IOError as myerr:
        if "No such file or directory" in myerr.strerror: #if you haven't
            #created the chromosome arrays yet, make them now
            lincRNAslist=cPickle.load(open(lincpath1,'rb'))
            linkLocations=_collectlocations(lincRNAslist)
            for chromzome in linkLocations:
                _clean_up_locations(chromzome) #If you don't clean up the
                #locations, then you have overlapping intervals, and it's just
                #cleaner to NOT have overlapping intervals
            mRNAslist=GTFparser_general.sortSmallFeatsbychrom(
                RefGene_parserII.returnRefGenelist(mrnapath1,["NM"]))
            mrnaLocations=_collectlocations(mRNAslist)
            for chromzome2 in mrnaLocations:
                _clean_up_locations(chromzome2)
            try: #similar setup for the nonintLocations file: create it if it
                #doesn't exist
               nonintLocations=cPickle.load(open(nonintpath1,'rb'))
            except IOError as myerr2:
                if "No such file or directory" in myerr2.strerror:
                    notintergenic(configpath1,nonintpath1)#name the file
                    #whatever is specified
                    #e.g. categorizeSNPs_NOTintergenic_locslist_<suffix>.bin
                    nonintLocations=cPickle.load(open(nonintpath1,'rb'))
                else:
                    raise IOError
            chromoarray.create_22chrom_arrays(mrnaLocations,linkLocations,
                                nonintLocations,bases1,[chrmdex1+1],
                                whether_to_use_real_sizes1,
                                dirtosavein=confFiledirpath)
            chrarray=cPickle.load(open(confFiledirpath+"/chromosome"
                            +`chrmdex1+1`+"_ext"+`bases1`+"_array.bin",'rb'))
        else:
            raise IOError #any IOError BESIDES not having created the chromosome
            #arrays will get reported
    return chrarray
Beispiel #2
0
def notintergenic(pathtoconfig,filename):
    #Tested in "SECOND TEST" as described in "10-15-13 testing categorizeSNPs
    #with fake files.docx"
    """Create a pickled Python list that describes 'notintergenic regions'

    ----------Function Output----------
    Creates a nested list of locations in the same format as the output of
    _collectlocations() for regions that are going to be excluded from
    "intergenic" classification. The nested list (24 sublists, indexed by
    feat.chromosome-1) is saved as a pickled Python list (protocol 2) at the
    path <filename>, exactly as given. Lines of the Hangauer file that could
    not be parsed are written to
    "h1fails_from_notintergenic_in_categorizeSNPs.txt" in the same directory
    as the configuration file (specified by <pathtoconfig>). Returns None.
    The choice of what to include in "notintergenic" is based on Hangauer
    paper. Within each chromosome the locations are passed through
    _clean_up_locations() so that the minimum number of intervals
    is reported (to help speed up the process of determining whether a SNP falls
    in a 'notintergenic' region.)
    ----------Configuration file----------
    Lines starting with "#" are ignored. The keys "mRNApath:", "gencodepath:",
    "hang1path:" and "pseudpath:" are each followed by a single space and the
    path. All four must be present.
    ----------Raises----------
    AssertionError if any of the four paths is missing from the configuration
    file, or if hang1path does not contain "19". These are raised explicitly
    so the checks still run under "python -O".
    ----------Regions included as "notintergenic"----------
        RefSeq NR genes (RNA) and XR genes (RNA predicted model)
        Gencode: everything in version 18
        Hangauer's extended protein coding gene structures derived from RNA-seq
        data (downloaded as S1)
        Pseudogenes from Yale
    Note: This is based on what Hangauer et al did but with a few differences,
    including: The "mappability" of the genome is not considered here
    (Hangauer only considered mappable regions); H-Invitational data is not
    used; alternative and extended 5' and 3' UTRs from UTRdb are not used;
    everything in Gencode was used instead of just "genes"
    Note: "the default human gene set in the Ensembl browser is therefore also
    the current version of GENCODE." Which is why it's weird to me that
    Hangauer mentions Gencode and Ensembl separately...?"""
    confFiledirpath="/".join(pathtoconfig.split("/")[:-1]) #should separator
    #ever be "\\"?
    paths={"mRNApath:":"","gencodepath:":"","hang1path:":"","pseudpath:":""}
    with open(pathtoconfig,'r') as confFile:
        for line in confFile:
            if line[0]=="#":
                continue
            lineaslist=line.split(" ")
            #a key with nothing after it is skipped here so that it is
            #reported by the configuration error below instead of IndexError
            if lineaslist[0] in paths and len(lineaslist)>1:
                paths[lineaslist[0]]=stripnewlineEnd(lineaslist[1])
    mRNApath=paths["mRNApath:"]
    gencodepath=paths["gencodepath:"]
    hang1path=paths["hang1path:"]
    pseudpath=paths["pseudpath:"]
    if "" in (mRNApath,gencodepath,hang1path,pseudpath):
        raise AssertionError("There was an error"
            +" parsing the configuration file: \n\tmRNA path was "
            +repr(mRNApath)+"\n\tgencodepath was "+repr(gencodepath)
            +"\n\thang1path was "+repr(hang1path)
            +"\n\tpseudpath was "+repr(pseudpath))
    #Obtain all RefSeq NR and XR genes:
    refseqz=RefGene_parserII.returnRefGenelist(mRNApath,["NR","XR"])
    #Take everything from Gencode
    gencodez=GTFparser_general.makeSmallfeatures_fromGTF(gencodepath,True,"ALL")
    GTFparser_general.giveranger(gencodez)
    #Use Hangauer's extended protein coding gene structures
    if "19" not in hang1path:
        raise AssertionError("Error: file name did not contain 19--"
                    +"use the Hangauer S1 file that's been converted to hg19!")
    h1list=[]
    with open(hang1path,'r') as h1file:
        with open(confFiledirpath
                  +"/h1fails_from_notintergenic_in_categorizeSNPs.txt",
                  'w') as h1fails:
            for line in h1file:
                try:
                    lineaslist=line.split("\t")
                    chrom=int(lineaslist[0][3:]) #chop off the chr prefix
                    startpoz=int(lineaslist[1])
                    stoppos=int(lineaslist[2])
                    h1list.append(FeatureClass_Small.SmallFeature("",".",chrom,
                                [startpoz],[stoppos],[1],[startpoz,stoppos]))
                except Exception: #deliberately best-effort, but no longer
                    #swallows KeyboardInterrupt/SystemExit
                    h1fails.write(line) #for example, if the chromosome was
                    #16_random, casting to an int would fail, so the line
                    #would not contribute to the "nonintergenic" regions and
                    #would get written to h1fails file
    #Get Yale pseudogenes
    pseuds=PseudogeneParser.return_pseudogenes(pseudpath)
    #Extract the locations from all, and make unified location list by chrom
    allfeats=refseqz+gencodez+h1list+pseuds #TO DO add in Ensembl
    locilist=[[] for _ in range(24)]
    for feat in allfeats:
        locilist[(feat.chromosome)-1].append(feat.ranger)
    #Now make simplest possible list of positions for each chromosome that
    #covers all the necessary bases
    for chrcounter,chromo in enumerate(locilist,1):
        print("cleaning up the locations for chromosome "+repr(chrcounter))
        _clean_up_locations(chromo)
    print("pickling the cleaned locations list")
    with open(filename,'wb') as picklefile:
        cPickle.dump(locilist,picklefile,2)
    print("All Done :D")
Beispiel #3
0
def notintergenic(pathtoconfig, filename):
    #Tested in "SECOND TEST" as described in "10-15-13 testing categorizeSNPs
    #with fake files.docx"
    """Create a pickled Python list that describes 'notintergenic regions'

    ----------Function Output----------
    Creates a nested list of locations in the same format as the output of
    _collectlocations() for regions that are going to be excluded from
    "intergenic" classification. The nested list (24 sublists, indexed by
    feat.chromosome-1) is saved as a pickled Python list (protocol 2) at the
    path <filename>, exactly as given. Lines of the Hangauer file that could
    not be parsed are written to
    "h1fails_from_notintergenic_in_categorizeSNPs.txt" in the same directory
    as the configuration file (specified by <pathtoconfig>). Returns None.
    The choice of what to include in "notintergenic" is based on Hangauer
    paper. Within each chromosome the locations are passed through
    _clean_up_locations() so that the minimum number of intervals
    is reported (to help speed up the process of determining whether a SNP falls
    in a 'notintergenic' region.)
    ----------Configuration file----------
    Lines starting with "#" are ignored. The keys "mRNApath:", "gencodepath:",
    "hang1path:" and "pseudpath:" are each followed by a single space and the
    path. All four must be present.
    ----------Raises----------
    AssertionError if any of the four paths is missing from the configuration
    file, or if hang1path does not contain "19". These are raised explicitly
    so the checks still run under "python -O".
    ----------Regions included as "notintergenic"----------
        RefSeq NR genes (RNA) and XR genes (RNA predicted model)
        Gencode: everything in version 18
        Hangauer's extended protein coding gene structures derived from RNA-seq
        data (downloaded as S1)
        Pseudogenes from Yale
    Note: This is based on what Hangauer et al did but with a few differences,
    including: The "mappability" of the genome is not considered here
    (Hangauer only considered mappable regions); H-Invitational data is not
    used; alternative and extended 5' and 3' UTRs from UTRdb are not used;
    everything in Gencode was used instead of just "genes"
    Note: "the default human gene set in the Ensembl browser is therefore also
    the current version of GENCODE." Which is why it's weird to me that
    Hangauer mentions Gencode and Ensembl separately...?"""
    #should separator ever be "\\"?
    confFiledirpath = "/".join(pathtoconfig.split("/")[:-1])
    paths = {
        "mRNApath:": "",
        "gencodepath:": "",
        "hang1path:": "",
        "pseudpath:": "",
    }
    with open(pathtoconfig, 'r') as confFile:
        for line in confFile:
            if line[0] == "#":
                continue
            lineaslist = line.split(" ")
            #a key with nothing after it is skipped here so that it is
            #reported by the configuration error below instead of IndexError
            if lineaslist[0] in paths and len(lineaslist) > 1:
                paths[lineaslist[0]] = stripnewlineEnd(lineaslist[1])
    mRNApath = paths["mRNApath:"]
    gencodepath = paths["gencodepath:"]
    hang1path = paths["hang1path:"]
    pseudpath = paths["pseudpath:"]
    if "" in (mRNApath, gencodepath, hang1path, pseudpath):
        raise AssertionError(
            "There was an error" +
            " parsing the configuration file: \n\tmRNA path was " +
            repr(mRNApath) + "\n\tgencodepath was " + repr(gencodepath) +
            "\n\thang1path was " + repr(hang1path) +
            "\n\tpseudpath was " + repr(pseudpath))
    #Obtain all RefSeq NR and XR genes:
    refseqz = RefGene_parserII.returnRefGenelist(mRNApath, ["NR", "XR"])
    #Take everything from Gencode
    gencodez = GTFparser_general.makeSmallfeatures_fromGTF(
        gencodepath, True, "ALL")
    GTFparser_general.giveranger(gencodez)
    #Use Hangauer's extended protein coding gene structures
    if "19" not in hang1path:
        raise AssertionError(
            "Error: file name did not contain 19--" +
            "use the Hangauer S1 file that's been converted to hg19!")
    h1list = []
    h1failspath = (confFiledirpath +
                   "/h1fails_from_notintergenic_in_categorizeSNPs.txt")
    with open(hang1path, 'r') as h1file:
        with open(h1failspath, 'w') as h1fails:
            for line in h1file:
                try:
                    lineaslist = line.split("\t")
                    chrom = int(lineaslist[0][3:])  #chop off the chr prefix
                    startpoz = int(lineaslist[1])
                    stoppos = int(lineaslist[2])
                    h1list.append(
                        FeatureClass_Small.SmallFeature(
                            "", ".", chrom, [startpoz], [stoppos], [1],
                            [startpoz, stoppos]))
                except Exception:  #deliberately best-effort, but no longer
                    #swallows KeyboardInterrupt/SystemExit
                    h1fails.write(line)  #for example, if the chromosome was
                    #16_random, casting to an int would fail, so the line
                    #would not contribute to the "nonintergenic" regions and
                    #would get written to h1fails file
    #Get Yale pseudogenes
    pseuds = PseudogeneParser.return_pseudogenes(pseudpath)
    #Extract the locations from all, and make unified location list by chrom
    allfeats = refseqz + gencodez + h1list + pseuds  #TO DO add in Ensembl
    locilist = [[] for _ in range(24)]
    for feat in allfeats:
        locilist[(feat.chromosome) - 1].append(feat.ranger)
    #Now make simplest possible list of positions for each chromosome that
    #covers all the necessary bases
    for chrcounter, chromo in enumerate(locilist, 1):
        print("cleaning up the locations for chromosome " + repr(chrcounter))
        _clean_up_locations(chromo)
    print("pickling the cleaned locations list")
    with open(filename, 'wb') as picklefile:
        cPickle.dump(locilist, picklefile, 2)
    print("All Done :D")
Beispiel #4
0
def _loadchrarray(chrmdex1, bases1, lincpath1, mrnapath1, nonintpath1,
                  configpath1, whether_to_use_real_sizes1):
    """Return the chrarray called "chromosome<chrmdex1+1>_ext<bases1>_array.bin"
    (for example, "chromosome1_ext0_array.bin").
    The chrarray should be located in the same directory as the configuration file.
    If the chrarray file already exists, it is loaded and returned.
    If the chrarray file does not exist, it is created using
    chromoarray.create_22chrom_arrays() for the
    correct chrmdex, and saved in the same directory as the configuration file.
    The creation of the chrarray file requires the nonintergenic regions to be
    defined. If they have been defined, they are loaded and used. If they have
    not been defined, then they are defined before being used."""
    confFilepathaslist = configpath1.rsplit(
        "/")  #should separator ever be "\\"?)
    confFiledirpath = "/".join(confFilepathaslist[:-1])
    try:
        chrarray = cPickle.load(
            open(
                confFiledirpath + "/chromosome" + ` chrmdex1 + 1 ` + "_ext" +
                ` bases1 ` + "_array.bin", 'rb'))
        #load the pre-created array for the appropriate chromosome
        #and bases1 parameter
    except IOError as myerr:
        if "No such file or directory" in myerr.strerror:  #if you haven't
            #created the chromosome arrays yet, make them now
            lincRNAslist = cPickle.load(open(lincpath1, 'rb'))
            linkLocations = _collectlocations(lincRNAslist)
            for chromzome in linkLocations:
                _clean_up_locations(chromzome)  #If you don't clean up the
                #locations, then you have overlapping intervals, and it's just
                #cleaner to NOT have overlapping intervals
            mRNAslist = GTFparser_general.sortSmallFeatsbychrom(
                RefGene_parserII.returnRefGenelist(mrnapath1, ["NM"]))
            mrnaLocations = _collectlocations(mRNAslist)
            for chromzome2 in mrnaLocations:
                _clean_up_locations(chromzome2)
            try:  #similar setup for the nonintLocations file: create it if it
                #doesn't exist
                nonintLocations = cPickle.load(open(nonintpath1, 'rb'))
            except IOError as myerr2:
                if "No such file or directory" in myerr2.strerror:
                    notintergenic(configpath1, nonintpath1)  #name the file
                    #whatever is specified
                    #e.g. categorizeSNPs_NOTintergenic_locslist_<suffix>.bin
                    nonintLocations = cPickle.load(open(nonintpath1, 'rb'))
                else:
                    raise IOError
            chromoarray.create_22chrom_arrays(mrnaLocations,
                                              linkLocations,
                                              nonintLocations,
                                              bases1, [chrmdex1 + 1],
                                              whether_to_use_real_sizes1,
                                              dirtosavein=confFiledirpath)
            chrarray = cPickle.load(
                open(
                    confFiledirpath + "/chromosome" + ` chrmdex1 + 1 ` +
                    "_ext" + ` bases1 ` + "_array.bin", 'rb'))
        else:
            raise IOError  #any IOError BESIDES not having created the chromosome
            #arrays will get reported
    return chrarray