def updateExcludedRegions(path,tfName,window):
    """Mark motifs of one TF that overlap ENCODE excluded regions.

    Every ``*.bed.gz`` file found directly in *path* is parsed with
    ``countBed.getBed6Anno`` and indexed with ``countBed.sortInterval``.
    Each document in the module-level ``mcollection`` whose ``tf_name``
    equals *tfName* is then checked against those intervals with
    ``countBed.getMotifAnno``; when the returned region list is not empty
    it is stored in ``motif_mapability_info.exclude``.

    Args:
        path: directory searched for ``*.bed.gz`` files.
        tfName: value matched against the ``tf_name`` field.
        window: passed through unchanged to ``countBed.getMotifAnno``.

    Returns:
        0, always.
    """
    for infile in glob.glob(os.path.join(path, "*.bed.gz")):
        # Experiment name is the file name up to its first dot.
        expName = os.path.basename(infile).split('.')[0]
        with gzip.open(infile, 'rt') as bedFile:
            bed = csv.reader(bedFile, delimiter='\t')
            annoIntvlDict = countBed.getBed6Anno(bed, expName)
            intervalDict = countBed.sortInterval(annoIntvlDict)

        # NOTE(review): the update below overwrites the list on each pass, so
        # with several bed files the last overlapping file wins -- confirm
        # this is intended.
        for test in mcollection.find({"tf_name": tfName}):
            region = test["motif_genomic_regions_info"]
            motifChrom = region["chr"]
            motifStart = region["start"]
            motifEnd = region["end"]
            regionList, valueList = countBed.getMotifAnno(
                annoIntvlDict, intervalDict,
                motifChrom, motifStart, motifEnd, window)
            if regionList != []:
                # Single pre-formatted string: same output as the original
                # multi-value print statement.
                print("%s %s %s %s %s" % (regionList, valueList,
                                          motifChrom, motifStart, motifEnd))
                # Dotted path so only the "exclude" field is replaced; a $set
                # on the whole "motif_mapability_info" sub-document would
                # drop its "score" and "gc_content" fields.
                mcollection.update(
                    {"_id": test["_id"]},
                    {"$set": {"motif_mapability_info.exclude": regionList}},
                    upsert=True)
    return 0
def _build_motif_instance(row):
    """Build the template motif-instance document for one TF-information row.

    *row* is one tab-separated record; columns 3, 6-12 and 14-19 are read.
    Columns 11 and 18 are converted to ``int`` and become ``None`` when they
    are not numeric. All genomic, mapability, conservation, gene-mapping and
    cell-type fields start out empty.
    """
    try:
        dbd_count = int(row[11])
    except ValueError:
        dbd_count = None
    try:
        msource_year = int(row[18])
    except ValueError:
        msource_year = None

    return {
        "motif_id": row[3],
        "tf_name": row[6],
        "motif_score": None,
        "motif_tf_info": {
            "species_name": row[7],
            "tf_status": row[8],  # direct or indirect
            "family_name": row[9],
            "dbds": row[10],
            "dbd_count": dbd_count,
            "dbid": row[12],
            "motif_type": row[14],
            "msource_id": row[15],
            "msource_type": row[16],
            "msource_author": row[17],
            "msource_year": msource_year,
            "pmid": row[19],  # citation
        },
        "motif_genomic_regions_info": {
            "chr": None,
            "start": None,
            "end": None,
            "strand": None,
        },
        "motif_mapability_info": {
            "exclude": [],
            "score": [],
            "gc_content": None,
        },
        "motif_cons_info": {
            "phylop_euarchontoglires": None,
            "phylop_mammals": None,
            "phylop_vertebrate": None,
            "phastCons_euarchontoglires": None,
            "phastCons_mammals": None,
            "phastCons_vertebrate": None,
        },
        "motif_gene_mapping_info": {
            "closest_gene": None,
            "feature": None,  # intergenic or 3' 5'
            "dist_tss": None,
            # genes whose range +/- 10kb contains the motif center
            "genelist10kb": [],
            "transcriptidlist10kb": [],
        },
        "motif_ct_info": {
            "ct_name": None,
            "ct_type": None,  # normal/cancerous/cellline/primarycell
            "accessibility_score": {},  # {type: log likelihood score}
            "chip_score": {},  # tf: pval(overlapping peaks)
            "h3k4me3_score": None,
            "h3k4me1_score": None,
            "h3k27ac_score": None,
            "p300_score": None,
            "pol2_score": None,
        },
    }


def main(argv):
    """Annotate stored Zscan4 motif instances with their 10kb target genes.

    Args:
        argv: ``[program, motif_tf_info_file, path-to-fimo-output]``.

    Returns:
        1 when the arguments are missing or point at nothing; otherwise
        ``None`` after the update pass has finished.

    Side effects:
        Binds the module-level ``mcollection`` to the
        ``motif_instance_hughes_test`` collection of the ``mm9`` database on
        localhost:27017 and saves every updated document back to it.
    """
    global mcollection

    if len(argv) < 3:
        sys.stderr.write("Usage: %s motif_tf_info_file path-to-fimo-output\n" % argv[0])
        return 1
    if not os.path.isfile(argv[1]):
        sys.stderr.write('Error: motif_info_file %r was not found!\n' % argv[1])
        return 1
    if not os.path.exists(argv[2]):
        sys.stderr.write('Error: path-to-fimo-output %r was not found!\n' % argv[2])
        return 1

    server = 'localhost'
    port = 27017
    client = MongoClient(server, port)
    db = client["mm9"]
    mcollection = db["motif_instance_hughes_test"]

    # Collection reset and index creation are currently disabled.

    # Read the validated argument rather than sys.argv so main() behaves the
    # same when it is called with an explicit argument list.
    with open(argv[1], 'rt') as ifile:
        for row in csv.reader(ifile, delimiter='\t'):
            # Inserting the template document is currently disabled; the row
            # is still parsed, so a short row raises IndexError as before.
            _build_motif_instance(row)

    print("%s %s" % ('before entering genomic region info ', mcollection.count()))
    startTime = time.clock()

    # Disabled steps that used argv[2]: updateMotifGenomicRegions, updateCons,
    # updateExcludedRegions, updateMap; GFF export is disabled as well.

    # Update gene features.
    with open('/home/xc406/data/mm9_refseq_June_2014.txt', 'rt') as refSeqFile:
        refSeqReader = csv.reader(refSeqFile, delimiter='\t')
        # The first two results are only needed by the disabled closest-gene
        # step.
        _tssDict, _geneNameDict, geneRangeDict = getRefSeqDict(refSeqReader)

    cursor = mcollection.find({"tf_name": "Zscan4"})
    intervalDict = countBed.sortInterval(geneRangeDict)
    for test in cursor:
        region = test["motif_genomic_regions_info"]
        t = getTargetGene(geneRangeDict, intervalDict,
                          region["chr"], region["start"], region["end"], 10000)
        # Store under "motif_gene_mapping_info", the sub-document the schema
        # template defines for "genelist10kb"; the previous key
        # "genomic_regions_gene_mapping" is not part of that template.
        test.setdefault("motif_gene_mapping_info", {})["genelist10kb"] = t[0]
        mcollection.save(test)

    print("%s %s" % ('total time', time.clock() - startTime))