Esempio n. 1
0
def main():
    """Print every GFF record annotated with its reference allele.

    Expects two positional command-line arguments, a 2bit genome file and a
    GFF file; if opening them in the documented order fails, the opposite
    order is tried.  For each record the reference sequence covering
    ``record.start``..``record.end`` is looked up, upper-cased, stored in the
    record's ``ref_allele`` attribute, and the record is printed.

    When the ``diff`` option is set, records whose existing ``ref_allele``
    attribute (surrounding double quotes stripped) already equals the
    reference sequence are skipped instead of printed.
    """
    # parse options
    option, args = doc_optparse.parse(__doc__)

    if len(args) < 2:
        doc_optparse.exit()

    # try opening the file both ways, in case the arguments got confused
    try:
        gff_file = gff.input(args[1])
        twobit_file = twobit.input(args[0])
    except Exception:
        gff_file = gff.input(args[0])
        twobit_file = twobit.input(args[1])

    for record in gff_file:
        # sequences are looked up under a "chr"-prefixed name
        if record.seqname.startswith("chr"):
            chromosome = record.seqname
        else:
            chromosome = "chr" + record.seqname

        # the slice start is record.start - 1, the end is record.end
        ref_allele = twobit_file[chromosome][(record.start - 1):record.end].upper()

        if option.diff:
            # nothing to report when the stored allele already matches
            if ("ref_allele" in record.attributes and
                    record.attributes["ref_allele"].strip("\"") == ref_allele):
                continue

        record.attributes["ref_allele"] = ref_allele
        print(record)
Esempio n. 2
0
def match2ref(gff_input, twobit_filename):
    """Yield GFF lines with each record compared against the reference.

    Args:
        gff_input: If this is a ``str`` ending in ".gz" it is opened with
            ``gzip.open`` and the file object is handed to ``gff.input``;
            any other value is handed to ``gff.input`` unchanged.
        twobit_filename: Passed to ``twobit.input`` to open the reference.

    Yields:
        Four header lines (emitted once the first record has been read),
        then one string per record:

        * records whose feature is already "REF" are yielded unchanged;
        * records whose ``alleles`` attribute equals the upper-cased
          reference sequence become "REF" records with no attributes;
        * other records with attributes gain a ``ref_allele`` attribute.

        Records whose ``attributes`` is empty or None are not yielded.
        NOTE(review): this mirrors the original behaviour; confirm that
        dropping such records is intended.

    Raises:
        SystemExit: with status 1 when the reference lookup returns an
            empty string for a record's span.
    """
    # A path ending in ".gz" is read through gzip; everything else is left
    # for gff.input to interpret.
    if isinstance(gff_input, str) and gff_input.endswith(".gz"):
        gff_file = gff.input(gzip.open(gff_input))
    else:
        gff_file = gff.input(gff_input)

    twobit_file = twobit.input(twobit_filename)

    header_done = False

    # Process input data to get ref allele
    for record in gff_file:
        # Have to do this after calling the first record to
        # get the iterator to read through the header data
        if not header_done:
            yield "##gff-version " + gff_file.data[0]
            yield "##genome-build " + gff_file.data[1]
            yield "# Produced by: gff_twobit_query.py"
            yield "# Date: " + datetime.datetime.now().isoformat(' ')
            header_done = True

        # REF lines pass through untouched
        if record.feature == "REF":
            yield str(record)
            continue

        # Add "chr" to chromosome ID if missing
        if record.seqname.startswith("chr"):
            chromosome = record.seqname
        else:
            chromosome = "chr" + record.seqname

        ref_seq = "-"  # represents variant with length zero
        if (record.end - (record.start - 1)) > 0:
            ref_seq = twobit_file[chromosome][(record.start - 1):record.end]
        if ref_seq == '':
            sys.stderr.write(
                "ERROR: this location does not exist in the reference genome. Start: %d, end: %d. Perhaps the input is aligned against a different reference genome?\n"
                % (record.start, record.end))
            # Exit with a failure status so callers can detect the error.
            sys.exit(1)

        if record.attributes:
            ref_allele = ref_seq.upper()
            # If reference at this pos, note this and remove attributes data.
            if ("alleles" in record.attributes
                    and record.attributes["alleles"] == ref_allele):
                record.feature = "REF"
                record.attributes = None
            else:
                record.attributes["ref_allele"] = ref_allele
            yield str(record)
def match2ref(gff_input, twobit_filename):
    """Generate GFF output lines annotated against a 2bit reference genome.

    Args:
        gff_input: A ``str`` ending in ".gz" is opened with ``gzip.open``
            before being given to ``gff.input``; anything else goes to
            ``gff.input`` as-is.
        twobit_filename: Argument for ``twobit.input``.

    Yields:
        Header lines (gff-version, genome-build, producer, date) once the
        first record has been read, followed by ``str(record)`` for:

        * existing "REF" records, unchanged;
        * records with attributes, either converted to "REF" (attributes
          cleared) when ``alleles`` equals the upper-cased reference
          sequence, or given a ``ref_allele`` attribute otherwise.

        Records with empty/None ``attributes`` produce no output.
        NOTE(review): preserved from the original; confirm this is wanted.

    Raises:
        SystemExit: status 1 if the reference returns an empty sequence
            for the record's coordinates.
    """
    # Only a string path ending in ".gz" is treated as gzip-compressed.
    is_gzip_path = isinstance(gff_input, str) and gff_input.endswith(".gz")
    if is_gzip_path:
        gff_file = gff.input(gzip.open(gff_input))
    else:
        gff_file = gff.input(gff_input)

    twobit_file = twobit.input(twobit_filename)

    header_done = False

    # Process input data to get ref allele
    for record in gff_file:
        # Have to do this after calling the first record to
        # get the iterator to read through the header data
        if not header_done:
            yield "##gff-version " + gff_file.data[0]
            yield "##genome-build " + gff_file.data[1]
            yield "# Produced by: gff_twobit_query.py"
            yield "# Date: " + datetime.datetime.now().isoformat(' ')
            header_done = True

        # REF lines are emitted as they came in
        if record.feature == "REF":
            yield str(record)
            continue

        # Add "chr" to chromosome ID if missing
        seqname = record.seqname
        chromosome = seqname if seqname.startswith("chr") else "chr" + seqname

        ref_seq = "-"  # represents variant with length zero
        if (record.end - (record.start - 1)) > 0:
            ref_seq = twobit_file[chromosome][(record.start - 1):record.end]
        if ref_seq == '':
            sys.stderr.write("ERROR: this location does not exist in the reference genome. Start: %d, end: %d. Perhaps the input is aligned against a different reference genome?\n" % (record.start, record.end))
            # Non-zero status: this is an error exit, not a normal one.
            sys.exit(1)

        if record.attributes:
            ref_allele = ref_seq.upper()
            # If reference at this pos, note this and remove attributes data.
            if ("alleles" in record.attributes and
                    record.attributes["alleles"] == ref_allele):
                record.feature = "REF"
                record.attributes = None
            else:
                record.attributes["ref_allele"] = ref_allele
            yield str(record)
	# return if we don't have the correct arguments
	if len(sys.argv) < 3:
		raise SystemExit(__doc__.replace("%prog", sys.argv[0]))
	
	# first, try to connect to the databases
	try:
		connection = MySQLdb.connect(host=DB_HOST, user=DB_READ_USER, passwd=DB_READ_PASSWD, db=DB_READ_DATABASE)
		cursor = connection.cursor()
	except MySQLdb.OperationalError, message:
		print "Error %d while connecting to database: %s" % (message[0], message[1])
		sys.exit()
	
	# try opening the file both ways, in case the arguments got confused
	try:
		gff_file = gff.input(sys.argv[2])
		twobit_file = twobit.input(sys.argv[1])
	except Exception:
		gff_file = gff.input(sys.argv[1])
		twobit_file = twobit.input(sys.argv[2])
	
	for record in gff_file:
		if record.seqname.startswith("chr"):
			chr = record.seqname
		else:
			chr = "chr" + record.seqname
		
		# recall that record.start is 1-based, but the database is not
		cursor.execute(query, (chr, record.start - 1, record.end - 1))
		data = cursor.fetchall()
		
		# go away if we have a non-coding sequence
Esempio n. 5
0
def get_allele_freqs(password, getev_file, excluded=None, chromfile=None,
                     outputfile=None):
    """Walk a set of genomes in parallel and report allele frequencies.

    Args:
        password: Passed to ``get_genome_list``.
        getev_file: Passed to ``load_getev`` to load GET-Evidence variants.
        excluded: Passed to ``get_genome_list``.
        chromfile: Optional; when given, read with ``read_single_items``
            and passed to ``GenomeSet`` as ``chroms``.
        outputfile: Optional; when given, results are written there (opened
            through ``autozip.file_open``) and progress messages are
            printed.  Otherwise results are printed and no progress
            messages are emitted by this function.

    The output file, if any, is closed when the function finishes or fails.
    """
    # Set up output, genome inputs, GET-Evidence variants, and twobit reference.
    if outputfile:
        print("Setting up output file")
        f_out = autozip.file_open(outputfile, 'w')
    else:
        f_out = None

    try:
        genome_ids = get_genome_list(password, excluded)
        if chromfile:
            if f_out:
                print("Getting chromosomes...")
            chroms = read_single_items(chromfile)
        else:
            chroms = None
        if f_out:
            print("Reading GET-Ev flat file (takes a couple minutes)...")
        getev_variants = load_getev(getev_file)
        if f_out:
            print("Loading twobit genome...")
        twobit_genome = twobit.input(TWOBIT_PATH)
        if f_out:
            print("Setting up GenomeSet (may be slow if each genome has to advance " +
                  "to target chromosomes)...")
            genome_set = GenomeSet(genome_ids, chroms=chroms,
                                   getev_vars=getev_variants, verbose=True)
        else:
            genome_set = GenomeSet(genome_ids, chroms=chroms,
                                   getev_vars=getev_variants)
        if f_out:
            print("Find earliest ends")
        earliest_ends = genome_set.earliest_ends()

        # Move through the genomes to find allele frequencies
        while genome_set.genomes:
            # Move ahead of all "earliest ends" & save new earliest.
            next_earliest = genome_set.advance_all_past_end_pos(earliest_ends[0])

            # Non-reference positions among the old "earliest ends"; they are
            # interesting if any carries an amino acid change or a
            # GET-Evidence id.
            has_var = [pos for pos in earliest_ends if not pos['ref']]
            is_interesting = any('amino_acid' in pos or 'getev_id' in pos
                                 for pos in has_var)

            # If there are interesting variants, calculate allele frequency.
            if has_var and is_interesting:
                # If another genome has an overlapping variant extending
                # beyond this position, we're not ready to evaluate this yet
                # (it will be caught when the later overlapping one comes up).
                if genome_set.no_later_var(has_var):
                    freqout = genome_set.eval_var_freq(has_var, twobit_genome)
                    if f_out:
                        f_out.write(freqout + '\n')
                    else:
                        print(freqout)

            genome_set.clean_out_prior_pos(earliest_ends)

            # Reset "earliest end" to next earliest positions.
            earliest_ends = next_earliest
    finally:
        # Close so buffered output reaches disk.
        # NOTE(review): assumes autozip.file_open returns an object with
        # close(); confirm.
        if f_out is not None:
            f_out.close()
	# parse options
	option, args = doc_optparse.parse(__doc__)
	
	if len(args) < 1:
		doc_optparse.exit()
	
	# first, try to connect to the database
	try:
		connection = MySQLdb.connect(host=DB_HOST, user=DB_READ_USER, passwd=DB_READ_PASSWD, db=DB_READ_DATABASE)
		cursor = connection.cursor()
	except MySQLdb.OperationalError, message:
		print "Error %d while connecting to database: %s" % (message[0], message[1])
		sys.exit()
	
	if option.reference:
		twobit_file = twobit.input(option.reference)
	
	for line in fileinput.input(args[0]):
		l = line.strip().split('\t')
		if len(l) < 5:
			print >> sys.stderr, l
		
		# input lines are in the form:
		# chromosome, position, rs, genotype, phenotype, pubmed (optional)
		if l[0].startswith("chr") or l[0] == "None":
			chr = l[0]
		else:
			chr = "chr" + l[0]
		try:
			pos = int(l[1])
		except ValueError:
def predict_nonsynonymous(gff_input, twobit_path, transcript_path, progresstracker=False):
    """Yield GFF lines, annotating records inferred to be nonsynonymous.

    Args:
        gff_input: A ``str`` ending in ".gz" is opened with ``gzip.open``
            before being passed to ``gff.input``; any other value is passed
            to ``gff.input`` unchanged.
        twobit_path: Passed to ``twobit.input``.
        transcript_path: Passed to ``transcript_file``.
        progresstracker: Optional object; when truthy its ``saw`` method is
            called with each non-REF record's chromosome name.

    Yields:
        Three header lines once the first record has been read, then
        ``str(record)`` for every record.  Records for which at least one
        covering transcript is inferred as "nonsynonymous coding" gain
        ``amino_acid`` and ``ucsc_trans`` attributes first.
    """
    twobit_file = twobit.input(twobit_path)
    transcript_input = transcript_file(transcript_path)

    # Set up gff_data: only a string path ending in ".gz" is read via gzip;
    # everything else is left for gff.input to interpret.
    if isinstance(gff_input, str) and gff_input.endswith(".gz"):
        gff_data = gff.input(gzip.open(gff_input))
    else:
        gff_data = gff.input(gff_input)

    header_done = False

    for record in gff_data:
        # Have to do this after calling the first record to
        # get the iterator to read through the header data
        if not header_done:
            yield "##genome-build " + gff_data.data[1]
            yield "# Produced by: gff_nonsynonymous_filter.py"
            yield "# Date: " + datetime.datetime.now().isoformat(" ")
            header_done = True

        if record.feature == "REF":
            yield str(record)
            continue

        # Normalise the sequence name to a lower-case "chr" prefix
        if record.seqname.startswith("chr"):
            chromosome = record.seqname
        elif record.seqname.startswith("Chr"):
            chromosome = "chr" + record.seqname[3:]
        else:
            chromosome = "chr" + record.seqname
        if progresstracker:
            progresstracker.saw(chromosome)

        # record.start is 1-based, but UCSC annotation starts are 0-based, so subtract 1
        record_position = (chromosome, record.start - 1)

        transcripts = transcript_input.cover_next_position(record_position)

        # Skip the rest if no transcripts are returned
        if not transcripts:
            yield str(record)
            continue

        # otherwise, cycle through
        nonsyn_inferences = []
        splice_inferences = []
        ucsc_transcripts = []

        for data in transcripts:
            # need to make "d" match up with transcript file order
            # d : geneName, strand, cdsStart, cdsEnd, exonStarts, exonEnds
            #     0, 3, 6, 7, 9, 10
            d = (data[0], data[3], int(data[6]), int(data[7]), data[9], data[10])
            i = infer_function(twobit_file, record, *d)
            if i[0] == "nonsynonymous coding":
                nonsyn_inferences.append("%s %s" % (d[0], i[2]))
                ucsc_transcripts.append(data[1])
            elif i[0] == "splice site":
                # Collected but not reported for now; we hope to report
                # splice sites later. - Madeleine 2010/11/29
                splice_inferences.append("%s %s " % (d[0], i[2]))

        # set the attributes if we can; the record is emitted either way
        if nonsyn_inferences:
            unique_inferences = unique(nonsyn_inferences)
            unique_inferences.sort(key=str.lower)
            record.attributes["amino_acid"] = "/".join(unique_inferences)
            record.attributes["ucsc_trans"] = ",".join(ucsc_transcripts)
        yield str(record)
Esempio n. 8
0
def get_allele_freqs(password,
                     getev_file,
                     excluded=None,
                     chromfile=None,
                     outputfile=None):
    """Report allele frequencies for interesting variants across genomes.

    Args:
        password: Forwarded to ``get_genome_list``.
        getev_file: Forwarded to ``load_getev``.
        excluded: Forwarded to ``get_genome_list``.
        chromfile: Optional; read with ``read_single_items`` and given to
            ``GenomeSet`` as ``chroms`` when provided.
        outputfile: Optional; when provided, frequency lines are written to
            it (opened through ``autozip.file_open``) and progress messages
            are printed.  Without it, frequency lines are printed and this
            function prints no progress messages.

    Any opened output file is closed before the function returns or
    propagates an exception.
    """
    # Set up output, genome inputs, GET-Evidence variants, and twobit reference.
    f_out = None
    if outputfile:
        print("Setting up output file")
        f_out = autozip.file_open(outputfile, 'w')

    try:
        genome_ids = get_genome_list(password, excluded)

        chroms = None
        if chromfile:
            if f_out:
                print("Getting chromosomes...")
            chroms = read_single_items(chromfile)

        if f_out:
            print("Reading GET-Ev flat file (takes a couple minutes)...")
        getev_variants = load_getev(getev_file)

        if f_out:
            print("Loading twobit genome...")
        twobit_genome = twobit.input(TWOBIT_PATH)

        if f_out:
            print(
                "Setting up GenomeSet (may be slow if each genome has to advance "
                + "to target chromosomes)...")
            genome_set = GenomeSet(genome_ids,
                                   chroms=chroms,
                                   getev_vars=getev_variants,
                                   verbose=True)
        else:
            genome_set = GenomeSet(genome_ids,
                                   chroms=chroms,
                                   getev_vars=getev_variants)

        if f_out:
            print("Find earliest ends")
        earliest_ends = genome_set.earliest_ends()

        # Move through the genomes to find allele frequencies
        while genome_set.genomes:
            # Move ahead of all "earliest ends" & save new earliest.
            next_earliest = genome_set.advance_all_past_end_pos(
                earliest_ends[0])

            # Check all old "earliest ends" positions for variants, and flag
            # the batch as interesting when any variant has an amino acid
            # change or a GET-Evidence id.
            has_var = [pos for pos in earliest_ends if not pos['ref']]
            is_interesting = any(
                'amino_acid' in pos or 'getev_id' in pos for pos in has_var)

            # If there are interesting variants, calculate allele frequency.
            if has_var and is_interesting:
                # If another genome has an overlapping variant extending
                # beyond this position, we're not ready to evaluate this yet
                # (it will be caught when the later overlapping one comes up).
                if genome_set.no_later_var(has_var):
                    freqout = genome_set.eval_var_freq(has_var, twobit_genome)
                    if f_out:
                        f_out.write(freqout + '\n')
                    else:
                        print(freqout)

            genome_set.clean_out_prior_pos(earliest_ends)

            # Reset "earliest end" to next earliest positions.
            earliest_ends = next_earliest
    finally:
        # Close the output so buffered data is written out.
        # NOTE(review): assumes the object from autozip.file_open has
        # close(); confirm.
        if f_out is not None:
            f_out.close()
def predict_nonsynonymous(gff_input,
                          twobit_path,
                          transcript_path,
                          progresstracker=False):
    """Generate GFF lines with nonsynonymous-change annotations added.

    Args:
        gff_input: If a ``str`` ending in ".gz", opened with ``gzip.open``
            and the file object given to ``gff.input``; otherwise given to
            ``gff.input`` directly.
        twobit_path: Argument for ``twobit.input``.
        transcript_path: Argument for ``transcript_file``.
        progresstracker: Optional; when truthy, ``progresstracker.saw`` is
            called with the chromosome name of each non-REF record.

    Yields:
        Genome-build, producer and date header lines after the first record
        is read, then ``str(record)`` for each record.  A record with one
        or more "nonsynonymous coding" inferences among its covering
        transcripts is given ``amino_acid`` and ``ucsc_trans`` attributes
        before being yielded.
    """
    twobit_file = twobit.input(twobit_path)
    transcript_input = transcript_file(transcript_path)

    # Set up gff_data: a string path ending in ".gz" goes through gzip,
    # anything else is interpreted by gff.input itself.
    is_gzip_path = isinstance(gff_input, str) and gff_input.endswith(".gz")
    if is_gzip_path:
        gff_data = gff.input(gzip.open(gff_input))
    else:
        gff_data = gff.input(gff_input)

    header_done = False

    for record in gff_data:
        # Have to do this after calling the first record to
        # get the iterator to read through the header data
        if not header_done:
            yield "##genome-build " + gff_data.data[1]
            yield "# Produced by: gff_nonsynonymous_filter.py"
            yield "# Date: " + datetime.datetime.now().isoformat(' ')
            header_done = True

        if record.feature == "REF":
            yield str(record)
            continue

        # Ensure the chromosome name carries a lower-case "chr" prefix
        seqname = record.seqname
        if seqname.startswith("chr"):
            chromosome = seqname
        elif seqname.startswith("Chr"):
            chromosome = "chr" + seqname[3:]
        else:
            chromosome = "chr" + seqname
        if progresstracker:
            progresstracker.saw(chromosome)

        # record.start is 1-based, but UCSC annotation starts are 0-based, so subtract 1
        record_position = (chromosome, record.start - 1)

        transcripts = transcript_input.cover_next_position(record_position)

        # Skip the rest if no transcripts are returned
        if not transcripts:
            yield str(record)
            continue

        # otherwise, cycle through
        nonsyn_inferences = []
        splice_inferences = []
        ucsc_transcripts = []

        for data in transcripts:
            # need to make "d" match up with transcript file order
            # d : geneName, strand, cdsStart, cdsEnd, exonStarts, exonEnds
            #     0, 3, 6, 7, 9, 10
            d = (data[0], data[3], int(data[6]), int(data[7]), data[9],
                 data[10])
            i = infer_function(twobit_file, record, *d)
            kind = i[0]
            if kind == "nonsynonymous coding":
                nonsyn_inferences.append("%s %s" % (d[0], i[2]))
                ucsc_transcripts.append(data[1])
            elif kind == "splice site":
                # Not reporting splice sites for now, but still collecting
                # them because we hope to later. - Madeleine 2010/11/29
                splice_inferences.append("%s %s " % (d[0], i[2]))

        # set the attributes if we can; every record is yielded regardless
        if nonsyn_inferences:
            unique_inferences = unique(nonsyn_inferences)
            unique_inferences.sort(key=str.lower)
            record.attributes["amino_acid"] = "/".join(unique_inferences)
            record.attributes["ucsc_trans"] = ",".join(ucsc_transcripts)
        yield str(record)
Esempio n. 10
0
def main():
    """Print a FASTA record with the reference sequence of each GFF record.

    Expects two positional command-line arguments, a 2bit genome file and a
    GFF file; if opening them in the documented order fails, the opposite
    order is tried.

    Options read from the parsed command line:
        flank: number of flanking bases to fetch on each side; the flanks
            are joined to the reference sequence with blank lines.
        strand: when set, sequences of records on the "-" strand are passed
            through ``reverse_complement``.
        unique: when set, consecutive equal records are collapsed into one
            output whose ``repetition_count`` attribute holds the run
            length.
    """
    # parse options
    option, args = doc_optparse.parse(__doc__)

    if len(args) < 2:
        doc_optparse.exit()

    flank = int(option.flank or 0)

    # try opening the file both ways, in case the arguments got confused
    try:
        gff_file = gff.input(args[1])
        twobit_file = twobit.input(args[0])
    except Exception:
        gff_file = gff.input(args[0])
        twobit_file = twobit.input(args[1])

    def print_fasta(rec, seq):
        # the FASTA header is the GFF line with tabs replaced by pipes
        print(FastaRecord(str(rec).replace("\t", "|"), seq))

    # state used to keep track of uniqueness; only consulted when the
    # unique option is set
    previous_record = None
    previous_ref_seq = None
    repetition_count = 1

    for record in gff_file:
        # if we're using the unique option, output the previous record only
        # when we're sure we've seen all repetitions of it
        if option.unique:
            if record == previous_record:
                repetition_count += 1
                continue
            if previous_record:
                previous_record.attributes["repetition_count"] = str(repetition_count)
                print_fasta(previous_record, previous_ref_seq)
            repetition_count = 1
            previous_record = record

        if record.seqname.startswith("chr"):
            chromosome = record.seqname
        else:
            chromosome = "chr" + record.seqname

        sequence = twobit_file[chromosome]
        ref_seq = sequence[(record.start - 1):record.end]

        if flank != 0:
            # calculate the flanks (these variables are 0-based)
            left_flank_start = max(record.start - flank - 1, 0)
            left_flank_end = record.start - 1

            right_flank_start = record.end
            right_flank_end = record.end + flank

            # now find them
            left_flank_seq = sequence[left_flank_start:left_flank_end]
            right_flank_seq = sequence[right_flank_start:right_flank_end]
            ref_seq = left_flank_seq + "\n\n" + ref_seq + "\n\n" + right_flank_seq

        if option.strand and record.strand == "-":
            ref_seq = reverse_complement(ref_seq)

        # we don't output the current record if we're using the unique option
        if option.unique:
            previous_ref_seq = ref_seq
        else:
            print_fasta(record, ref_seq)

    # we'll have one last record yet to output if we used the unique option;
    # with an empty GFF there is no such record, so don't touch None
    if option.unique and previous_record is not None:
        previous_record.attributes["repetition_count"] = str(repetition_count)
        print_fasta(previous_record, previous_ref_seq)