Esempio n. 1
0
args = parser.parse_args()

# The entry number is incremented below, which fails on a str.
# NOTE(review): the parser definition is outside this view -- confirm whether
# the option already declares type=int; int() is a no-op if it does.
entryNumber = int(args.entryNumber)

#
# We assume that the hg19 coordinates and the GRCh37-lite coordinates
# are sorted in the same sort order: by chrom and then chromStart.
# There might be hg19 entries missing from the GRCh37 coordinates,
# as some pieces of hg19 are not in GRCh37-lite.  For each line in the
# GRCh37-lite file, read to the corresponding entry in the hg19 file.
# If we read to the end of the hg19 file without finding the
# entry we're looking for, stop with an error message naming that entry.
with open(args.hg19Bed) as hg19Fp, open(args.grch37LiteBed) as grch37LiteFp:
    for grch37LiteRow in grch37LiteFp:
        grch37LiteBed = Bed.Bed(grch37LiteRow.rstrip().split())
        # hg19Fp is one shared iterator, so every search resumes right after
        # the previously matched hg19 line instead of rewinding.
        for hg19Row in hg19Fp:
            hg19Bed = Bed.Bed(hg19Row.rstrip().split())
            if hg19Bed.name == grch37LiteBed.name:
                break
        else:
            # The hg19 file ran out (or was already exhausted) without a
            # match: the GRCh37-lite entry has no hg19 counterpart.
            sys.exit("Error: missing entry for %s in the hg19 bed" %
                     (grch37LiteBed.name))
        gg = Grch37LiteGaf.GafDbSnp(grch37LiteBed, entryNumber=entryNumber)
        gg.featureInfo = basicSnpInfo(hg19Bed, args.snpTable, cursor)
        (gg.gene, gg.geneLocus) = mapSnpToLocus(grch37LiteBed, gg.featureInfo,
                                                cursor)
        # The record is built with the current number, then stamped with the
        # incremented one before it is written.
        entryNumber = entryNumber + 1
        gg.entryNumber = entryNumber
        gg.write(sys.stdout)
sys.exit(0)
# Print the PSL records whose query name also appears in the BED file and
# for which bedIsSubsetOfPsl() accepts the (BED, PSL) pair.
parser = argparse.ArgumentParser()
parser.add_argument(
    'inputPsl',
    type=str,
    help="PSL file from which selected records will be printed")
parser.add_argument('inputBed',
                    type=str,
                    help="Bed file to be used for screening the PSL records")
args = parser.parse_args()

# Index the BED records by name; a later record with the same name replaces
# an earlier one.
bedAlignments = dict()
with open(args.inputBed) as bedFp:
    for line in bedFp:
        bedAlign = Bed.Bed(line.rstrip().split())
        bedAlignments[bedAlign.name] = bedAlign

# Nothing is parsed until a line starting with "--" has been seen; such
# lines are themselves never parsed as records.
readPastHeader = False
with open(args.inputPsl) as pslFp:
    for line in pslFp:
        line = line.rstrip()
        if line.startswith("--"):
            readPastHeader = True
        elif readPastHeader:
            pslAlign = Psl.Psl(line.split())
            # "in" replaces dict.has_key(), which no longer exists in
            # Python 3; the single-argument print() form behaves the same
            # under Python 2 and Python 3.
            if (pslAlign.qName in bedAlignments and
                    bedIsSubsetOfPsl(bedAlignments[pslAlign.qName], pslAlign)):
                print(line)
Esempio n. 3
0
# Build two lookups from the GFF features:
#   idToLabel       feature id -> "<Name>|<ID>" label
#   miRnaToPreMiRna miRNA feature id -> value of its "derives_from" qualifier
# NOTE(review): miRnaToPreMiRna and entryNumber are initialised before the
# part of the script visible here.
idToLabel = dict()
gffIter = GFF.parse(args.inputGff)
for chrom in gffIter:
    for hit in chrom.features:
        idToLabel[hit.id] = "%s|%s" % (hit.qualifiers["Name"][0],
                                       hit.qualifiers["ID"][0])
        if hit.type == "miRNA":
            miRnaToPreMiRna[hit.id] = hit.qualifiers["derives_from"][0]

#
# Read the bed file containing the GRCh37-lite coordinates.
# While converting each line to GAF format, replace the ID
# with the miRNA name, look up the name of the pre-miRNA that
# the miRNA is derived from, and note that in the featureInfo field.
with open(args.miRnaBed) as miRnaBedFp:
    for line in miRnaBedFp:
        bb = Bed.Bed(line.rstrip().split())
        gg = Grch37LiteGaf.GafMiRna(bb)
        gg.featureId = idToLabel[gg.featureId]
        preMiRnaId = miRnaToPreMiRna[bb.name]
        preMiRnaName = idToLabel[preMiRnaId]
        gg.featureInfo = "pre-miRNA=%s" % (preMiRnaName)
        (geneName, geneLocus) = preMiRnaGeneLookup(preMiRnaName, cursor)
        gg.gene = geneName
        gg.geneLocus = geneLocus
        entryNumber = entryNumber + 1
        gg.entryNumber = entryNumber
        gg.write(sys.stdout)
sys.exit(0)
Esempio n. 4
0
                    help="Probe BED file, GRCh37-lite coordinates")
# type=int keeps a command-line value numeric, matching the integer default;
# without it argparse stores the supplied value as a string.
parser.add_argument("-n",
                    dest="entryNumber",
                    type=int,
                    help="Initial entry number",
                    default=0)
# -d takes a value (argparse's default "store" action); the flag is only
# tested for truthiness below, so any non-empty value turns debugging on.
parser.add_argument("-d",
                    dest="debug",
                    help="Optional debugging info",
                    default=False)
args = parser.parse_args()

entryNumber = args.entryNumber
fp = open(args.grch37LiteBed)
for line in fp:
    line = line.rstrip()
    grch37LiteBed = Bed.Bed(line.split())
    gg = Grch37LiteGaf.GafMaProbe(grch37LiteBed, entryNumber)

    #
    # Look up any overlapping genes.  If multiple overlapping genes are found,
    # parse the genes and gene locus strings into semicolon-delimited lists.
    # A gene overlaps when it is on the same chrom and strand, starts at or
    # before the probe's end, and ends at or after the probe's start.
    # NOTE(review): the BED values are interpolated straight into the SQL
    # text; consider driver-side parameter binding once the DB driver behind
    # `cursor` (not visible in this part of the file) is confirmed.
    geneXrefQuery = """SELECT DISTINCT geneName, grch37LiteLocus
                          FROM gafGeneXref
                         WHERE grch37LiteChrom = '%s' AND grch37LiteChromStart <= %s
                           AND grch37LiteChromEnd >= %s
                           and grch37LiteStrand = '%s'""" % (
        grch37LiteBed.chrom, grch37LiteBed.chromEnd, grch37LiteBed.chromStart,
        grch37LiteBed.strand)
    cursor.execute(geneXrefQuery)
    if args.debug:
        # Single-argument print() emits the same text under Python 2 and 3.
        print("executing query %s" % geneXrefQuery)
Esempio n. 5
0
# type=int is required: the entry number is incremented below, and argparse
# would otherwise hand back a command-line value as a string.
parser.add_argument("-n",
                    dest="entryNumber",
                    type=int,
                    help="Initial entry number",
                    default=0)
args = parser.parse_args()

db = MySQLdb.connect(host="localhost",
                     db="hg19",
                     read_default_file="~/.my.cnf")
cursor = db.cursor(MySQLdb.cursors.DictCursor)

# The cluster ID is bound by the driver rather than pasted into the SQL
# text, so quoting characters in the BED name cannot alter the statement.
geneXrefQuery = """SELECT geneName, grch37LiteLocus FROM gafGeneXref
                    WHERE clusterId = %s"""

entryNumber = args.entryNumber
with open(args.inputBed) as fp:
    for line in fp:
        bb = Bed.Bed(line.rstrip().split())
        # The BED name is ';'-delimited: the last token is used as the
        # cluster ID and the remaining tokens are rejoined as the name.
        tokens = bb.name.split(";")
        clusterId = tokens.pop()
        bb.name = ";".join(tokens)
        gg = Grch37LiteGaf.GafGene(bb, entryNumber, True)
        cursor.execute(geneXrefQuery, (clusterId,))
        # Records are written only when the cluster maps to exactly one
        # xref row; all other records are skipped without a message.
        if cursor.rowcount == 1:
            row = cursor.fetchone()
            gg.gene = row["geneName"]
            gg.featureId = row["geneName"]
            gg.geneLocus = row["grch37LiteLocus"]
            entryNumber = entryNumber + 1
            gg.write(sys.stdout)
# NOTE(review): the process exit status is the final entry number, presumably
# so a calling script can pick it up -- confirm, since exit statuses are
# typically limited to a small range by the OS.
sys.exit(entryNumber)
Esempio n. 6
0
                    type=str,
                    help="Input bed file in GRCh37-lite coordinates")
args = parser.parse_args()

db = MySQLdb.connect(host="localhost",
                     db="hg19",
                     read_default_file="~/.my.cnf")
cursor = db.cursor(MySQLdb.cursors.DictCursor)

#
# First, make a dictionary of the hg19 coordinates of each gene,
# keyed by BED name (a repeated name keeps the last record read).
# The with-block closes the file; the original "hg19Fp.close" lacked
# the call parentheses and therefore never closed it.
hg19Coordinates = dict()
with open(args.hg19Bed) as hg19Fp:
    for line in hg19Fp:
        bb = Bed.Bed(line.rstrip().split())
        hg19Coordinates[bb.name] = bb

#
# Next, read through each entry in GRCh37-lite coordinates.
# At each entry, look up the hg19 coordinates, which will be
# used later for comparison with other tables in hg19.  Also at
# each entry, separate the gene name and cluster ID, build a
# GRCh37-lite locus string, and store the gene name, cluster ID,
# gene locus string, and hg19 coordinates within separate columns.
lineCount = 0
grch37LiteFp = open(args.grch37LiteBed)
for line in grch37LiteFp:
    line = line.rstrip()
    grch37LiteCoords = Bed.Bed(line.split())