Beispiel #1
0



# Read the source data into a list with one record per line.  Only trailing
# '\n' characters are stripped from each record; any other trailing
# whitespace (e.g. tabs marking empty fields) is left untouched.
bioMartList = [record.rstrip('\n') for record in srcFile.readlines()]

# Group the records on field 0 (the gene name, per the original author) and
# split each record's fields into lists.
# NOTE(review): the exact shape of the returned structure depends on
# JamesDefs.groupByField, which is not visible here -- confirm.
groupedList = JamesDefs.groupByField(bioMartList, 0)

# Combine the exon records of each gene into a single one-line gene record with
# start and stop coords for the coding region.  Per the original author, the
# TranscriptID field is removed, and the number of exons encountered and the
# chromosomal coverage are appended, respectively, to the end of each record.
# bdryLen is passed straight through to combineExons (defined elsewhere).
oneLineRecordList = combineExons(groupedList, bdryLen)

# Write the combined records to boundaryFile and close it.
# writelines() adds no line separators of its own, so combineExons presumably
# terminates each record with '\n' -- confirm.
boundaryFile.writelines(oneLineRecordList)
boundaryFile.close()




# Signal completion to the user (Python 2 print statement).
print 'Tada!'
Beispiel #2
0
# Record the start time.
t1 = time()

#  Populate a dict with the parsed records of the Anopheles boundary seqs.
#  On entry boundarySeqs holds the path to the FASTA file; it is rebound first
#  to the list of parsed records and finally to the dict built from them.
#  The file is opened explicitly so the handle is closed as soon as parsing is
#  done, even if the parser raises.
boundarySeqsHandle = open(boundarySeqs, "rU")
try:
    boundarySeqs = list(SeqIO.parse(boundarySeqsHandle, "fasta"))
finally:
    boundarySeqsHandle.close()

#  Kludge: force every sequence to carry the IUPAC ambiguous DNA alphabet so
#  that the values of the dict built below have it.
#  NOTE(review): if IUPACAmbiguousDNA is the alphabet class rather than an
#  instance of it, this stores the class object itself -- confirm intent.
for record in boundarySeqs:
    record.seq.alphabet = IUPACAmbiguousDNA

#  Key each record by the first whitespace-delimited token of its description.
boundarySeqs = SeqIO.to_dict(boundarySeqs,
                             key_function=lambda rec: rec.description.split()[0])

# Convert the IUPAC motifs to regexes, creating a list of lists with each motif
# represented as ['IUPAC', 'REGEX'].
# NOTE(review): the return value is discarded, so convertMotifList presumably
# rewrites motifList in place -- confirm against its definition.
convertMotifList(motifList)


#  Group the cluster definitions on field 0 (the ClusterName, per the original
#  author); the name is rebound to the grouped result.
clusterDefinitionList = JamesDefs.groupByField(clusterDefinitionList, 0)

#  This will become a list of tab-delimited params for the hyperGeo func:
#  'Motif:ClusterID';'motifCountInAll';'len(all)';'motifCountInCluster';'numOfSeqsInCluster'
#  It starts out empty here.
hyperGeoParams_4_motifClusterPairs = []

m=0
for motif in motifList:
    m+=1
    print 'Motif '+str(m)
    #  Count how many seq in total list have motif in either orientation
    motifCountInAll = None
    motifCountInAll = countMotifInAll(motif[1], boundarySeqs)
 
    
    for cluster in clusterDefinitionList:
# Open outFile for writing (truncating any existing content) and keep the
# handle for the results written later in the script.
resFile = open(outFile, 'w')

# Record the start time.
tick = time.clock()

# Read the conflict file into a list with one record per line.  Only trailing
# '\n' characters are stripped from each record; other trailing whitespace is
# left untouched.
conflictList = [record.rstrip('\n') for record in conflictFile.readlines()]


# Bucket the conflict records on field 1 (the target gene id, per the original
# author) using groupByField.
fjoinOutByGeneIDList = JamesDefs.groupByField(conflictList, 1)

# Settings handed to resolver().
# NOTE(review): the integer values look like field (column) indexes into each
# record -- confirm against resolver's definition.
resolverArgs = {
    'strandField': 4,
    'lowerBoundProximal': 10,
    'higherBoundProximal': 11,
    'conflictRegionStrt': 18,
    'conflictRegionEnd': 19,
    'whichBoundary': 'upStream',
}

# Resolve the grouped conflicts with the settings above.
resolvedBoundariesList = resolver(fjoinOutByGeneIDList, resolverArgs)

# Write the resolved records to the results file.  writelines() adds no line
# separators of its own.
resFile.writelines(resolvedBoundariesList)

# Record the end time.
tock = time.clock()
#========================= User Defined Variables =========================

#  Open handle on the original FASTA file.  Despite the name this is a file
#  object at this point; it is parsed into a dict further down.
originalFastaDict = open('/Users/biggus/Documents/James/Data/2KB/2kb_Sequence/2kb_Anopheles/2KBupTSS_goodAffyAGAPsFastasOUT.masked.nr.fas', 'rU')

#  Path to the cluster definition file (rebound below to the list of its lines)
desiredFastaList  = '/Users/biggus/Documents/James/Data/ClusterDefs/TC-Clusters.txt'

#  Directory path; presumably where the per-cluster FASTA files are written --
#  its use is not visible in this part of the script.
outDir            = '/Users/biggus/Documents/James/Data/ClusterDefs/TC-Fastas/'

#==========================================================================

# Read the cluster definition file into a list of lines with leading and
# trailing whitespace stripped.  The file is opened explicitly so the handle is
# closed once the lines have been read, even if reading raises.
clusterDefHandle = open(desiredFastaList, 'rU')
try:
    desiredFastaList = [line.strip() for line in clusterDefHandle.readlines()]
finally:
    clusterDefHandle.close()

# Parse clusterDefs into list of clusters by grouping the lines on field 0
listOfClusterDefs = JamesDefs.groupByField(desiredFastaList, 0)




#  Parse the FASTA records with BioPython and build a dict keyed by the first
#  whitespace-delimited token of each record's description (the geneID field,
#  per the original author).  originalFastaDict holds the open file handle on
#  entry; keep a second reference to it so it can still be closed after the
#  name is rebound to the dict.
originalFastaHandle = originalFastaDict
try:
    originalFastaDict = SeqIO.to_dict(SeqIO.parse(originalFastaHandle, 'fasta'),
                                      key_function=lambda rec: rec.description.split()[0])
finally:
    originalFastaHandle.close()

for cluster in listOfClusterDefs:
    print "Working on Cluster: %s" % (cluster[0][0])
    #  New dict to catch copied seqObjs
    desiredFastaObjList = []
    
    for rec in cluster:
        if originalFastaDict.has_key(rec[1]):