args = parser.parse_args()
entryNumber = args.entryNumber

#
# We assume that the hg19 coordinates and the GRCh37-lite coordinates
# are sorted in the same order: by chrom and then chromStart.
# There might be hg19 entries missing from the GRCh37-lite coordinates,
# as some pieces of hg19 are not in GRCh37-lite.  For each line in the
# GRCh37-lite file, read forward to the corresponding entry in the hg19
# file.  If we reach the end of the hg19 file without finding the entry
# we're looking for, exit with an error message.
#
# hg19Bed is kept across iterations of the outer loop because the hg19
# file is only ever read forward; it starts as None so that an empty hg19
# file produces the error message below instead of a NameError.
hg19Bed = None
with open(args.hg19Bed) as hg19Fp, open(args.grch37LiteBed) as grch37LiteFp:
    for grch37LiteRow in grch37LiteFp:
        grch37LiteBed = Bed.Bed(grch37LiteRow.rstrip().split())

        # Advance through the hg19 file until the names match.
        for hg19Row in hg19Fp:
            hg19Bed = Bed.Bed(hg19Row.rstrip().split())
            if hg19Bed.name == grch37LiteBed.name:
                break

        if hg19Bed is None or hg19Bed.name != grch37LiteBed.name:
            # The entry that could not be found is the GRCh37-lite one,
            # and the file that lacks it is the hg19 bed.
            sys.exit("Error: missing entry for %s in the hg19 bed"
                     % (grch37LiteBed.name))

        # Build the GAF record from the GRCh37-lite coordinates; the SNP
        # details are looked up with the matching hg19 coordinates.
        gg = Grch37LiteGaf.GafDbSnp(grch37LiteBed, entryNumber=entryNumber)
        gg.featureInfo = basicSnpInfo(hg19Bed, args.snpTable, cursor)
        (gg.gene, gg.geneLocus) = mapSnpToLocus(grch37LiteBed,
                                                gg.featureInfo, cursor)
        entryNumber = entryNumber + 1
        gg.entryNumber = entryNumber
        gg.write(sys.stdout)
sys.exit(0)
parser = argparse.ArgumentParser()
parser.add_argument(
    'inputPsl', type=str,
    help="PSL file from which selected records will be printed")
parser.add_argument('inputBed', type=str,
                    help="Bed file to be used for screening the PSL records")
args = parser.parse_args()

# Index the screening BED records by name.  If a name occurs more than
# once, the last record with that name wins.
bedAlignments = dict()
with open(args.inputBed) as bedFp:
    for line in bedFp:
        bedAlign = Bed.Bed(line.rstrip().split())
        bedAlignments[bedAlign.name] = bedAlign

# Print each PSL record whose query name has a BED record that
# bedIsSubsetOfPsl() accepts.  Nothing is parsed until a line starting
# with "--" has been seen; such lines are themselves never parsed.
readPastHeader = False
with open(args.inputPsl) as pslFp:
    for line in pslFp:
        line = line.rstrip()
        if line.startswith("--"):
            readPastHeader = True
            continue
        if not readPastHeader:
            continue
        pslAlign = Psl.Psl(line.split())
        if (pslAlign.qName in bedAlignments
                and bedIsSubsetOfPsl(bedAlignments[pslAlign.qName],
                                     pslAlign)):
            # Single-argument print(): same output under Python 2 and 3.
            print(line)
# Scan the GFF annotation once, building two lookups:
#   idToLabel       feature ID -> "Name|ID" label, for every feature
#   miRnaToPreMiRna miRNA feature ID -> value of its "derives_from"
#                   qualifier
idToLabel = dict()
for chrom in GFF.parse(args.inputGff):
    for hit in chrom.features:
        idToLabel[hit.id] = "%s|%s" % (hit.qualifiers["Name"][0],
                                       hit.qualifiers["ID"][0])
        if hit.type == "miRNA":
            miRnaToPreMiRna[hit.id] = hit.qualifiers["derives_from"][0]

#
# Read the bed file containing the GRCh37-lite coordinates.
# While converting each line to GAF format, replace the ID
# with the miRNA label, look up the pre-miRNA that the miRNA
# is derived from, and note that in the featureInfo field.
# An ID that is absent from the GFF lookups raises KeyError.
with open(args.miRnaBed) as miRnaBedFp:
    for line in miRnaBedFp:
        bb = Bed.Bed(line.rstrip().split())
        gg = Grch37LiteGaf.GafMiRna(bb)
        gg.featureId = idToLabel[gg.featureId]

        preMiRnaId = miRnaToPreMiRna[bb.name]
        preMiRnaName = idToLabel[preMiRnaId]
        gg.featureInfo = "pre-miRNA=%s" % (preMiRnaName)

        # The gene and locus are looked up via the pre-miRNA label.
        (geneName, geneLocus) = preMiRnaGeneLookup(preMiRnaName, cursor)
        gg.gene = geneName
        gg.geneLocus = geneLocus

        entryNumber = entryNumber + 1
        gg.entryNumber = entryNumber
        gg.write(sys.stdout)
sys.exit(0)
help="Probe BED file, GRCh37-lite coordinates") parser.add_argument("-n", dest="entryNumber", help="Initial entry number", default=0) parser.add_argument("-d", dest="debug", help="Optional debugging info", default=False) args = parser.parse_args() entryNumber = args.entryNumber fp = open(args.grch37LiteBed) for line in fp: line = line.rstrip() grch37LiteBed = Bed.Bed(line.split()) gg = Grch37LiteGaf.GafMaProbe(grch37LiteBed, entryNumber) # # Look up any overlapping genes. If multiple overlapping genes are found, # parse the genes and gene locus strings into semicolon-delimited lists. geneXrefQuery = """SELECT DISTINCT geneName, grch37LiteLocus FROM gafGeneXref WHERE grch37LiteChrom = '%s' AND grch37LiteChromStart <= %s AND grch37LiteChromEnd >= %s and grch37LiteStrand = '%s'""" % ( grch37LiteBed.chrom, grch37LiteBed.chromEnd, grch37LiteBed.chromStart, grch37LiteBed.strand) cursor.execute(geneXrefQuery) if (args.debug): print "executing query", geneXrefQuery
# type=int: without it a value given on the command line arrives as a
# string and the "entryNumber + 1" below raises TypeError.
parser.add_argument("-n", dest="entryNumber", type=int,
                    help="Initial entry number", default=0)
args = parser.parse_args()

db = MySQLdb.connect(host="localhost", db="hg19",
                     read_default_file="~/.my.cnf")
cursor = db.cursor(MySQLdb.cursors.DictCursor)

# The cluster ID is passed as a query parameter so the driver does the
# quoting, rather than being interpolated into the SQL text.
geneXrefQuery = """SELECT geneName, grch37LiteLocus FROM gafGeneXref
                   WHERE clusterId = %s"""

entryNumber = args.entryNumber
with open(args.inputBed) as fp:
    for line in fp:
        bb = Bed.Bed(line.rstrip().split())

        # The BED name is a ";"-separated list whose last token is the
        # cluster ID; strip that token off and keep the rest as the name.
        tokens = bb.name.split(";")
        clusterId = tokens.pop()
        bb.name = ";".join(tokens)
        gg = Grch37LiteGaf.GafGene(bb, entryNumber, True)

        # Fill in the gene fields only when the cluster ID matches exactly
        # one cross-reference row.
        cursor.execute(geneXrefQuery, (clusterId,))
        if cursor.rowcount == 1:
            row = cursor.fetchone()
            gg.gene = row["geneName"]
            gg.featureId = row["geneName"]
            gg.geneLocus = row["grch37LiteLocus"]

        entryNumber = entryNumber + 1
        gg.write(sys.stdout)

# The final entry number is handed back as the process exit status.
# NOTE(review): exit statuses are small integers on most platforms, so
# large counts will not survive intact -- confirm how callers use this.
sys.exit(entryNumber)
type=str, help="Input bed file in GRCh37-lite coordinates") args = parser.parse_args() db = MySQLdb.connect(host="localhost", db="hg19", read_default_file="~/.my.cnf") cursor = db.cursor(MySQLdb.cursors.DictCursor) # # First, make a dictionary of the hg19 coordinates of each gene hg19Coordinates = dict() hg19Fp = open(args.hg19Bed) for line in hg19Fp: line = line.rstrip() bb = Bed.Bed(line.split()) hg19Coordinates[bb.name] = bb hg19Fp.close # # Next, read through each entry in GRCh37-lite coordinates. # At each entry, look up the hg19 coordinates, which will be # used later for comparison with other tables in hg19. Also at # each entry, separate the gene name and cluster ID, build a # GRCh37-lite locus string, and store the gene name, cluster ID, # gene locus string, and hg19 coordinates within separate columns. lineCount = 0 grch37LiteFp = open(args.grch37LiteBed) for line in grch37LiteFp: line = line.rstrip() grch37LiteCoords = Bed.Bed(line.split())