def header_data(gff_in, metadata=None, check_ref=0):
    """Read GFF header data from file, store or return metadata

    Optionally also checks the first N lines for records where the type is
    "REF" (third column). (Our genome processing treats these as regions
    where the genotype is "called" as matching the reference genome.)

    Arguments:
    gff_in: str path (opened with gzip if it ends in ".gz") or any other
        input accepted by gff.input()
    metadata: optional dict to store the results in; a new dict is created
        when omitted
    check_ref: int, number of leading records to scan for a "REF" feature
        (0, the default, skips the scan)

    Returns the metadata dict with 'gff-format' and 'build' set, plus
    'has_ref' when check_ref > 0.
    """
    # Create the dict per call: a dict() default is built once and would be
    # shared by every call that omits 'metadata', leaking values between files.
    if metadata is None:
        metadata = dict()

    # Set up GFF data
    if isinstance(gff_in, str) and re.search(r'\.gz$', gff_in):
        gff_data = gff.input(gzip.open(gff_in))
    else:
        gff_data = gff.input(gff_in)

    # Pull record to force GFFFile to read through header, then store metadata.
    record = gff_data.next()
    metadata['gff-format'], metadata['build'] = gff_data.data[0:2]

    # Check for REF lines if we asked to do this. False unless we see some.
    if check_ref > 0:
        metadata['has_ref'] = False
        for _ in range(check_ref):
            try:
                if record.feature == "REF":
                    metadata['has_ref'] = True
                    break
                record = gff_data.next()
            except StopIteration:
                # Fewer than check_ref records in the file.
                break
    return metadata
def main():
    """Print each GFF record with its ref_allele attribute set from a 2bit file.

    With the diff option set, records whose existing ref_allele already
    equals the reference sequence are not printed.
    """
    # parse options
    option, args = doc_optparse.parse(__doc__)
    if len(args) < 2:
        doc_optparse.exit()

    # try opening the file both ways, in case the arguments got confused
    try:
        gff_file = gff.input(args[1])
        twobit_file = twobit.input(args[0])
    except Exception:
        gff_file = gff.input(args[0])
        twobit_file = twobit.input(args[1])

    for record in gff_file:
        # The 2bit file is indexed with "chr"-prefixed sequence names.
        seqname = record.seqname
        chrom = seqname if seqname.startswith("chr") else "chr" + seqname
        # record.start is 1-based; the slice is 0-based and half-open.
        reference = twobit_file[chrom][(record.start - 1):record.end].upper()

        if option.diff and "ref_allele" in record.attributes:
            if record.attributes["ref_allele"].strip("\"") == reference:
                continue

        record.attributes["ref_allele"] = reference
        print(record)
def main():
    """Print every item of the intersection of two GFF inputs.

    The inputs are the first two command line arguments; the usage text
    (the module docstring) is shown when fewer than two are given.
    """
    # return if we don't have the correct arguments
    if len(sys.argv) < 3:
        usage = __doc__.replace("%prog", sys.argv[0])
        raise SystemExit(usage)

    first = gff.input(sys.argv[1])
    second = gff.input(sys.argv[2])
    for overlap in first.intersect(second):
        print(overlap)
def match2ref(gff_input, twobit_filename):
    """Yield GFF-formatted strings annotated against a 2bit reference genome

    Arguments:
    gff_input: str path (opened with gzip if it ends in ".gz") or any other
        input accepted by gff.input(), e.g. a GFF string generator
    twobit_filename: passed to twobit.input() to open the reference genome

    Yields header lines first, then one string per record. Records whose
    feature is "REF" are passed through unchanged. A record whose "alleles"
    attribute equals the reference sequence becomes a "REF" record with its
    attributes removed; any other record with attributes gets a "ref_allele"
    attribute.

    Writes an error to stderr and exits with status 1 if a position is not
    found in the reference genome.
    """
    # Iff gff_input is a string ending with ".gz", assume gzip compressed
    if isinstance(gff_input, str) and re.match(r".*\.gz$", gff_input):
        gff_file = gff.input(gzip.open(gff_input))
    else:
        # GFF will interpret if gff_input is string containing path
        # to a GFF-formatted text file, or a string generator
        # (e.g. file object) with GFF-formatted strings
        gff_file = gff.input(gff_input)

    twobit_file = twobit.input(twobit_filename)

    header_done = False

    # Process input data to get ref allele
    for record in gff_file:
        # Have to do this after calling the first record to
        # get the iterator to read through the header data
        if not header_done:
            yield "##gff-version " + gff_file.data[0]
            yield "##genome-build " + gff_file.data[1]
            yield "# Produced by: gff_twobit_query.py"
            yield "# Date: " + datetime.datetime.now().isoformat(' ')
            header_done = True

        # Skip REF lines
        if record.feature == "REF":
            yield str(record)
            continue

        # Add "chr" to chromosome ID if missing
        if record.seqname.startswith("chr"):
            chrom = record.seqname
        else:
            chrom = "chr" + record.seqname

        ref_seq = "-"  # represents variant with length zero
        if (record.end - (record.start - 1)) > 0:
            ref_seq = twobit_file[chrom][(record.start - 1):record.end]

        if ref_seq == '':
            sys.stderr.write(
                "ERROR: this location does not exist in the reference genome. "
                "Start: %d, end: %d. Perhaps the input is aligned against a "
                "different reference genome?\n" % (record.start, record.end))
            # Non-zero status so callers can tell the run failed.
            sys.exit(1)

        if record.attributes:
            # If reference at this pos, note this and remove attributes data.
            if ("alleles" in record.attributes and
                    record.attributes["alleles"] == ref_seq.upper()):
                record.feature = "REF"
                record.attributes = None
            else:
                record.attributes["ref_allele"] = ref_seq.upper()
        yield str(record)
def match2dbSNP(gff_input_file, dbsnp_file):
    """Yield GFF-formatted strings with dbSNP cross-references added

    Arguments:
    gff_input_file: str path (opened with gzip if it ends in ".gz") or any
        other input accepted by gff.input()
    dbsnp_file: passed to dbSNP() to set up the dbSNP input

    Yields header lines first, then one string per record. "REF" records
    are passed through unchanged. When the dbSNP input has an entry at the
    record's position, "dbsnp:rs<ID>" is appended to the record's "Dbxref"
    (GFF version >= 3) or "db_xref" attribute unless that ID is already
    listed there.
    """
    # Set up dbSNP input
    dbSNP_input = dbSNP(dbsnp_file)

    # Create genome_file record generator
    if isinstance(gff_input_file, str) and re.match(r".*\.gz$", gff_input_file):
        gff_data = gff.input(gzip.open(gff_input_file))
    else:
        gff_data = gff.input(gff_input_file)

    header_done = False

    for record in gff_data:
        # Have to do this after calling the first record to
        # get the iterator to read through the header data
        if not header_done:
            yield "##gff-version " + gff_data.data[0]
            yield "##genome-build " + gff_data.data[1]
            yield "# Produced by: gff_dbsnp_query.py"
            yield "# Date: " + datetime.datetime.now().isoformat(' ')
            header_done = True

        if record.feature == "REF":
            yield str(record)
            continue

        # chromosome prefix not used by dbSNP, so it is removed if present
        if record.seqname.startswith("chr") or record.seqname.startswith("Chr"):
            chromosome = record.seqname[3:]
        else:
            chromosome = record.seqname

        # position is adjusted to match the zero-start used by dbSNP positions
        record_position = (chromosome, record.start - 1)

        dbSNP_position = dbSNP_input.up_to_position(record_position)
        dbSNP_data = dbSNP_input.data

        if (dbSNP_position and
                dbSNP_input.comp_position(dbSNP_position, record_position) == 0):
            dbSNP_datum = "dbsnp:rs%s" % dbSNP_data[0]
            # GFF3 and earlier versions name the cross-reference attribute
            # differently; everything else is handled the same way.
            if record.version >= 3:
                xref_key = "Dbxref"
            else:
                xref_key = "db_xref"

            record_dbxref_data = []
            if xref_key in record.attributes:
                record_dbxref_data = record.attributes[xref_key].split(",")

            # Match the ID as a whole number: a plain substring search would
            # treat ID 123 as already present when only rs1234 is listed.
            id_pattern = re.compile(
                r"(?<!\d)" + re.escape(str(dbSNP_data[0])) + r"(?!\d)")
            if not any(id_pattern.search(datum) for datum in record_dbxref_data):
                record_dbxref_data.append(dbSNP_datum)
                record.attributes[xref_key] = ",".join(record_dbxref_data)

        yield str(record)
def match2ref(gff_input, twobit_filename):
    """Yield GFF-formatted strings annotated against a 2bit reference genome

    Arguments:
    gff_input: str path (opened with gzip if it ends in ".gz") or any other
        input accepted by gff.input(), e.g. a GFF string generator
    twobit_filename: passed to twobit.input() to open the reference genome

    Yields header lines first, then one string per record. Records whose
    feature is "REF" are passed through unchanged. A record whose "alleles"
    attribute equals the reference sequence becomes a "REF" record with its
    attributes removed; any other record with attributes gets a "ref_allele"
    attribute.

    Writes an error to stderr and exits with status 1 if a position is not
    found in the reference genome.
    """
    # Iff gff_input is a string ending with ".gz", assume gzip compressed
    is_gzip_path = (isinstance(gff_input, str) and
                    re.match(r".*\.gz$", gff_input))
    if is_gzip_path:
        gff_file = gff.input(gzip.open(gff_input))
    else:
        # GFF will interpret if gff_input is string containing path
        # to a GFF-formatted text file, or a string generator
        # (e.g. file object) with GFF-formatted strings
        gff_file = gff.input(gff_input)

    twobit_file = twobit.input(twobit_filename)

    header_done = False

    # Process input data to get ref allele
    for record in gff_file:
        # Have to do this after calling the first record to
        # get the iterator to read through the header data
        if not header_done:
            yield "##gff-version " + gff_file.data[0]
            yield "##genome-build " + gff_file.data[1]
            yield "# Produced by: gff_twobit_query.py"
            yield "# Date: " + datetime.datetime.now().isoformat(' ')
            header_done = True

        # Skip REF lines
        if record.feature == "REF":
            yield str(record)
            continue

        # Add "chr" to chromosome ID if missing
        if record.seqname.startswith("chr"):
            chrom = record.seqname
        else:
            chrom = "chr" + record.seqname

        ref_seq = "-"  # represents variant with length zero
        if (record.end - (record.start - 1)) > 0:
            ref_seq = twobit_file[chrom][(record.start - 1):record.end]

        if ref_seq == '':
            sys.stderr.write(
                "ERROR: this location does not exist in the reference genome. "
                "Start: %d, end: %d. Perhaps the input is aligned against a "
                "different reference genome?\n" % (record.start, record.end))
            # Non-zero status so callers can tell the run failed.
            sys.exit(1)

        if record.attributes:
            # If reference at this pos, note this and remove attributes data.
            if ("alleles" in record.attributes and
                    record.attributes["alleles"] == ref_seq.upper()):
                record.feature = "REF"
                record.attributes = None
            else:
                record.attributes["ref_allele"] = ref_seq.upper()
        yield str(record)
def main():
    """Print the SNP records of a version-3 GFF file as version-2 records.

    Attribute changes per record: "allele" becomes "alleles" (a pair of
    identical alleles is collapsed to one), "ref" becomes "ref_allele", and
    "support1"/"support2" are folded into "counts".
    """
    # return if we don't have the correct arguments
    if len(sys.argv) < 2:
        raise SystemExit(__doc__.replace("%prog", sys.argv[0]))

    for record in gff.input(sys.argv[1], version=3):
        # SNPs only, please
        if record.feature != "SNP":
            continue

        # downgrade
        record.version = 2

        # standardize a few things about the GFF record
        attrs = record.attributes
        alleles = attrs["allele"].split("/")
        is_identical_pair = len(alleles) == 2 and alleles[0] == alleles[1]
        attrs["alleles"] = alleles[0] if is_identical_pair else "/".join(alleles)
        del attrs["allele"]

        attrs["ref_allele"] = attrs["ref"]
        del attrs["ref"]

        if "/" in attrs["alleles"]:
            attrs["counts"] = "%s/%s" % (attrs["support1"], attrs["support2"])
            del attrs["support2"]
        else:
            attrs["counts"] = attrs["support1"]
        del attrs["support1"]

        print(record)
def main():
    """Downgrade SNP records from a version-3 GFF file and print them.

    Non-SNP records are dropped. For the rest, "allele", "ref" and the
    "support1"/"support2" attributes are replaced by "alleles", "ref_allele"
    and "counts".
    """
    # return if we don't have the correct arguments
    if len(sys.argv) < 2:
        usage = __doc__.replace("%prog", sys.argv[0])
        raise SystemExit(usage)

    yh_gff = gff.input(sys.argv[1], version=3)
    for record in yh_gff:
        # SNPs only, please
        if record.feature == "SNP":
            # downgrade
            record.version = 2

            # standardize a few things about the GFF record
            allele_list = record.attributes["allele"].split("/")
            if len(allele_list) != 2 or allele_list[0] != allele_list[1]:
                record.attributes["alleles"] = "/".join(allele_list)
            else:
                # two identical alleles are written once
                record.attributes["alleles"] = allele_list[0]
            del record.attributes["allele"]

            record.attributes["ref_allele"] = record.attributes["ref"]
            del record.attributes["ref"]

            first_support = record.attributes["support1"]
            if record.attributes["alleles"].find("/") != -1:
                record.attributes["counts"] = "%s/%s" % (
                    first_support, record.attributes["support2"])
                del record.attributes["support2"]
            else:
                record.attributes["counts"] = first_support
            del record.attributes["support1"]

            print(record)
def __init__(self, f_child, f_parA, f_parB, mend_errs):
    """Store the input file names and open each one as a GFF input.

    Index 0 is the child, 1 is parent A and, when f_parB is given, 2 is
    parent B. The same indices key self.filenames, self.gffs and
    self.positions.
    """
    self.mend_errs = mend_errs

    inputs = [f_child, f_parA]
    if f_parB is not None:
        inputs.append(f_parB)
    self.filenames = dict(enumerate(inputs))

    # Positions are a tuple of chromosome, start, end, and gff record
    self.positions = dict(
        (idx, ('chr1', -1, -1, None)) for idx in self.filenames)

    # Set up input/output files
    self.gffs = dict.fromkeys(self.filenames)
    for idx, filename in self.filenames.items():
        self.gffs[idx] = gff.input(autozip.file_open(filename, 'r'))
def __init__(self, f_child, f_parA, f_parB, mend_errs):
    """Initializes class variables, opens input files.

    f_child and f_parA are always used (indices 0 and 1); f_parB is
    optional and takes index 2 when it is not None.
    """
    # Positions are a tuple of chromosome, start, end, and gff record
    initial_position = ('chr1', -1, -1, None)

    self.filenames = {0: f_child, 1: f_parA}
    if f_parB is not None:
        self.filenames[2] = f_parB
    self.mend_errs = mend_errs

    self.gffs = {}
    self.positions = {}
    for idx in self.filenames:
        self.gffs[idx] = None
        self.positions[idx] = initial_position

    # Set up input/output files
    for idx in self.filenames:
        handle = autozip.file_open(self.filenames[idx], 'r')
        self.gffs[idx] = gff.input(handle)
def main():
    """Print the records of a GFF file as standardized "SNP" records.

    The "ref_counts" and "oth_counts" attributes are replaced by a single
    "counts" attribute, and "alleles" is reduced to one allele when the
    reference allele has no counts.
    """
    # return if we don't have the correct arguments
    if len(sys.argv) < 2:
        raise SystemExit(__doc__.replace("%prog", sys.argv[0]))

    for record in gff.input(sys.argv[1]):
        # standardize feature name
        record.feature = "SNP"

        # double check alleles and allele counts
        attrs = record.attributes
        alleles = attrs["alleles"]
        ref_allele = attrs["ref_allele"]
        ref_counts = int(attrs["ref_counts"])
        oth_counts = int(attrs["oth_counts"])
        ref_is_first = alleles.startswith(ref_allele)

        if ref_counts == 0:
            # if we're homozygous for the other allele, then we exclude
            # the reference allele from the list of alleles
            alleles = alleles[-1] if ref_is_first else alleles[0]
            counts = str(oth_counts)
        elif ref_is_first:
            # otherwise, we make sure that the first allele listed is the
            # reference allele, and create the counts attribute accordingly
            counts = "%s/%s" % (ref_counts, oth_counts)
        else:
            # this shouldn't happen, but in case, we do it the other way
            counts = "%s/%s" % (oth_counts, ref_counts)

        # now we modify the record and output
        attrs["alleles"] = alleles
        attrs["counts"] = counts
        del attrs["ref_counts"]
        del attrs["oth_counts"]
        print(record)
def _ref_chromosome_name(seqname, ref_genome):
    """Return seqname as it is named in ref_genome, or "" if there is no match."""
    # Tried in order: the name as given, with a "chr" prefix added, and with
    # its first three characters replaced by "chr" (e.g. "Chr1" -> "chr1").
    for candidate in (seqname, "chr" + seqname, "chr" + seqname[3:]):
        if candidate in ref_genome:
            return candidate
    return ""


def genome_metadata(gff_input, genome_stats_file, progresstracker):
    """Take GFF, track and record associated metadata, yield same GFF lines

    Required arguments:
    gff_input: file or GFF-formatted string generator
    genome_stats_file: str, path to a text file containing chromosome sizes
    progresstracker: ProgressTracker object

    The following keys will store metadata in progresstracker.metadata:
    chromosomes: list of str, chromosome names
    called_num: int, # of positions called
    match_num: int, # of positions called w/chr matching ref
    ref_all_num: int, # of positions in reference genome (includes unplaceable)
    ref_nogap_num: int, # of placeable positions in reference genome
    called_frac_all: float, fraction of reference called (includes unplaceable)
    called_frac_nogap: float, fraction of placeable reference called

    The two called_frac_* keys are only set when the corresponding reference
    size is greater than zero.

    Returns a generator, yielding same GFF-formatted strings as were inputed.
    """
    # 'chromosomes_raw' holds all the raw chromosome names seen so far.
    # progresstracker.metadata['chromosomes'] has the same names edited,
    # if needed, to match ref_genome.
    chromosomes_raw = set()

    # 'called_num' counts total positions called, while 'match_num' only counts
    # positions which match a chromosome ID in the ref_genome data.
    called_num = 0
    match_num = 0

    # 'ref_all_num' and 'ref_nogap_num' increment total and placeable genome
    # sizes (respectively) when new chromosomes are seen (for example, the
    # lengths for chrY are only added if chrY was seen).
    ref_all_num = 0
    ref_nogap_num = 0

    # Set up gff_data.
    if isinstance(gff_input, str) and re.match(r".*\.gz$", gff_input):
        gff_data = gff.input(gzip.open(gff_input))
    else:
        gff_data = gff.input(gff_input)

    # Get chromosome lengths (total and placeable) for reference genome.
    try:
        ref_genome = get_genome_stats(progresstracker.metadata['build'],
                                      genome_stats_file)
    except KeyError:
        ref_genome = get_genome_stats(DEFAULT_BUILD, genome_stats_file)

    # Initialize chromosomes list, we'll add them as we see them.
    progresstracker.metadata['chromosomes'] = list()

    # Progress through GFF input.
    header_done = False
    for record in gff_data:
        # Have to do this after calling the first record to
        # get the iterator to read through the header data
        if not header_done:
            yield "##gff-version " + gff_data.data[0]
            yield "##genome-build " + gff_data.data[1]
            yield "# Produced by: get_metadata.py"
            header_done = True

        # Record number of positions called.
        dist = (record.end - (record.start - 1))
        called_num += dist

        # Standardized chromosome name, "" if not in the reference genome.
        chr_name = _ref_chromosome_name(record.seqname, ref_genome)
        if chr_name:
            match_num += dist

        # If this is a new chromosome: (1) Add it to our chromosomes list,
        # (2) increase genome size variables (ref_all_num and ref_nogap_num)
        # (3) call progresstracker.saw().
        if record.seqname not in chromosomes_raw:
            chromosomes_raw.add(record.seqname)
            if chr_name:
                progresstracker.metadata['chromosomes'].append(chr_name)
                # Look sizes up under the standardized name: the raw name
                # (e.g. "1") may not be a key of ref_genome even though
                # "chr1" is.
                ref_all_num += ref_genome[chr_name]['seq_all']
                ref_nogap_num += ref_genome[chr_name]['seq_nogap']
                progresstracker.saw(chr_name)

        yield str(record)

    progresstracker.metadata['called_num'] = called_num
    progresstracker.metadata['match_num'] = match_num
    progresstracker.metadata['ref_all_num'] = ref_all_num
    progresstracker.metadata['ref_nogap_num'] = ref_nogap_num

    if ref_all_num > 0:
        called_frac_all = match_num * 1.0 / ref_all_num
        progresstracker.metadata['called_frac_all'] = called_frac_all
    if ref_nogap_num > 0:
        called_frac_nogap = match_num * 1.0 / ref_nogap_num
        progresstracker.metadata['called_frac_nogap'] = called_frac_nogap
# first, try to connect to the databases try: connection = MySQLdb.connect(host=DB_HOST, user=HGMD_USER, passwd=HGMD_PASSWD, db=HGMD_DATABASE) cursor = connection.cursor() except MySQLdb.OperationalError, message: sys.stderr.write ("Error %d while connecting to database: %s" % (message[0], message[1])) sys.exit() # make sure the required table is really there try: cursor.execute ('DESCRIBE mutation') except MySQLdb.Error: sys.stderr.write ("No mutation table => empty output") sys.exit() gff_file = gff.input(sys.argv[1]) for record in gff_file: # lightly parse alleles alleles = record.attributes["alleles"].strip("\"").split("/") ref_allele = record.attributes["ref_allele"].strip("\"") # determine zygosity if len(alleles) == 1: zygosity = "hom" else: zygosity = "het" # examine each amino acid change amino_acid_changes = record.attributes["amino_acid"].strip("\"").split("/") for a in amino_acid_changes: amino_acid = a.split(" ")
def main():
    """Print the id and attributes of every record in a GFF file.

    The file is the first command line argument.
    """
    for record in gff.input(sys.argv[1]):
        # One line per record: id, a space, then the attributes.
        print("%s %s" % (record.id, record.attributes))
def report_uncovered(gff_input, transcript_filename, genetests_filename,
                     output_file=None, progresstracker=None):
    """Compare GFF records to transcripts to find missing coding regions

    Reports missing regions, yielding JSON-formatted strings. If output_file
    is provided, instead yields the GFF-formatted strings from gff_input and
    writes the JSON-formatted report strings to file.

    Required arguments:
    gff_input: GFF-formatted strings, string generator or file (can be .gz)
    transcript_filename: transcripts file
    genetests_filename: genetests file

    Optional arguments:
    output_file: If provided, opens and writes to this location (see above);
        gzip-compressed if the name ends with '.gz'
    progresstracker: If provided, records metadata to progresstracker.metadata
    """
    # Set up GFF input. If it ends with '.gz', assume gzip compressed.
    if isinstance(gff_input, str) and re.match(r".*\.gz$", gff_input):
        gff_data = gff.input(gzip.open(gff_input))
    else:
        gff_data = gff.input(gff_input)

    # set up transcript file input
    transcript_input = transcript.Transcript_file(transcript_filename)

    # grab genetests gene names
    genetests_names = set()
    with open(genetests_filename) as genetests_input:
        for line in genetests_input:
            if re.match("#", line):
                continue
            data = line.split("\t")
            if data[4] == "na":
                continue
            if not re.match(".*Clinical", data[5]):
                # currently we require "clinical testing available"
                continue
            for name in data[4].split("|"):
                genetests_names.add(name)

    # Set up optional output.
    f_out = None
    if output_file:
        # re.search, not re.match: match only tests the start of the string,
        # so a pattern of '\.gz$' would never match a name like "out.gz".
        if re.search(r'\.gz$', output_file):
            f_out = gzip.open(output_file, 'w')
        else:
            f_out = open(output_file, 'w')

    try:
        # If progresstracker was sent, track these for metadata.
        if progresstracker:
            progresstracker.metadata['ref_coding_n'] = 0
            progresstracker.metadata['ref_coding_clintest_n'] = 0
            progresstracker.metadata['called_coding_n'] = 0
            progresstracker.metadata['called_coding_clintest_n'] = 0

        # Store to-be-examined regions, we'll remove covered regions from
        # this list.
        # key: Transcript object
        # value: list of tuples (chr (string), start (int), end (int))
        # Note: Start is 1-based, not 0-based as is in transcript files
        examined_regions = {}

        header_done = False
        for record in gff_data:
            # NOTE(review): the header lines are yielded even when no
            # output_file is given, i.e. mixed in with the JSON strings --
            # confirm callers expect this.
            if not header_done:
                yield "##gff-version " + gff_data.data[0]
                yield "##genome-build " + gff_data.data[1]
                yield "# Produced by: call_missing.py"
                header_done = True

            if f_out:
                yield str(record)

            # Move forward in transcripts until past record end.
            chromosome = std_chr_name(record.seqname)
            next_region = (chromosome, record.start, record.end)
            removed_transcripts = transcript_input.cover_next_position(
                next_region)

            for curr_ts in transcript_input.transcripts:
                # Add to examined_regions if new.
                if curr_ts not in examined_regions:
                    regions = []
                    for i, start in enumerate(curr_ts.data["coding_starts"]):
                        region = (curr_ts.data["chr"],
                                  start + 1,
                                  curr_ts.data["coding_ends"][i])
                        regions.append(region)
                    examined_regions[curr_ts] = regions

                # Examine regions and remove any covered by the record.
                curr_ts_regions = examined_regions[curr_ts]
                examined_regions[curr_ts] = remove_covered(curr_ts_regions,
                                                           record)

            # Process past transcripts.
            results = process_ts_missing(removed_transcripts, examined_regions,
                                         genetests_names, progresstracker)
            for gene_data in results:
                if gene_data["length"] > 0:
                    if f_out:
                        f_out.write(json.dumps(gene_data) + '\n')
                    else:
                        yield json.dumps(gene_data)

        # Move through any remaining transcripts and return missing.
        beyond_end_hack = ("chrZ", 9999999999)
        removed_transcripts = transcript_input.cover_next_position(
            beyond_end_hack)
        remaining_transcripts = (removed_transcripts +
                                 transcript_input.transcripts)
        results = process_ts_missing(remaining_transcripts, examined_regions,
                                     genetests_names, progresstracker)
        for gene_data in results:
            if gene_data["length"] > 0:
                if f_out:
                    f_out.write(json.dumps(gene_data) + '\n')
                else:
                    yield json.dumps(gene_data)
    finally:
        # Close the report file even if the consumer stops iterating early
        # or an error is raised part-way through.
        if f_out:
            f_out.close()
def main():
    """Print one JSON line per amino acid change of each GFF record.

    The GFF file is the first command line argument. A record without an
    "amino_acid" attribute produces a single JSON line without the "gene"
    and "amino_acid_change" keys.
    """
    # return if we don't have the correct arguments
    if len(sys.argv) < 2:
        raise SystemExit(__doc__.replace("%prog", sys.argv[0]))

    gff_file = gff.input(sys.argv[1])
    for record in gff_file:
        # lightly parse alleles
        alleles = record.attributes["alleles"].strip("\"").split("/")
        ref_allele = record.attributes["ref_allele"].strip("\"")

        # compress identical alleles like "A/A" into just "A"
        while len(alleles) > 1 and alleles[0].upper() == alleles[1].upper():
            alleles.pop(0)

        trait_allele = None

        # determine zygosity
        if len(alleles) == 1:
            zygosity = "hom"
            trait_allele = alleles[0]
        else:
            zygosity = "het"

        genotype = "/".join(alleles)
        if ref_allele in alleles:
            leftover_alleles = list(alleles)
            leftover_alleles.remove(ref_allele)
            # List the reference allele first. Joining one list avoids the
            # dangling "/" ("A/") produced when no other allele is left.
            genotype = "/".join([ref_allele] + leftover_alleles)
            if not trait_allele and len(leftover_alleles) == 1:
                trait_allele = leftover_alleles[0]

        # get dbSNP ID
        dbSNP_ID = None
        if "db_xref" in record.attributes:
            dbSNP_ID = record.attributes["db_xref"]
            # Remove the prefix as a whole; lstrip("dbsnp:") would strip any
            # run of those characters and can eat into the ID itself.
            if dbSNP_ID.startswith("dbsnp:"):
                dbSNP_ID = dbSNP_ID[len("dbsnp:"):]

        # examine each amino acid change
        # The membership test uses the same key that is read; testing for
        # "amino_acid_changes" meant the changes were never picked up.
        if "amino_acid" in record.attributes:
            amino_acid_changes = record.attributes["amino_acid"].strip(
                "\"").split("/")
        else:
            amino_acid_changes = list()

        if record.start == record.end:
            coordinates = str(record.start)
        else:
            coordinates = str(record.start) + "-" + str(record.end)

        for a in amino_acid_changes:
            amino_acid = a.split(" ")
            gene = amino_acid.pop(0)  # the first item is always the gene name
            aa_done = set()
            for amino_acid_change_and_position in amino_acid:
                # report each change once per gene
                if amino_acid_change_and_position in aa_done:
                    continue
                aa_done.add(amino_acid_change_and_position)

                output = {
                    "chromosome": record.seqname,
                    "coordinates": coordinates,
                    "gene": gene,
                    "amino_acid_change": amino_acid_change_and_position,
                    "genotype": genotype,
                    "ref_allele": ref_allele,
                    "trait_allele": trait_allele,
                    "zygosity": zygosity,
                    "variant": str(record),
                }
                if dbSNP_ID:
                    output["dbSNP"] = dbSNP_ID
                print(json.dumps(output))

        # Print one json line if there were no amino acid changes
        if not amino_acid_changes:
            output = {
                "chromosome": record.seqname,
                "coordinates": coordinates,
                "genotype": genotype,
                "ref_allele": ref_allele,
                "trait_allele": trait_allele,
                "zygosity": zygosity,
                "variant": str(record),
            }
            if dbSNP_ID:
                output["dbSNP"] = dbSNP_ID
            print(json.dumps(output))
def match_getev(gff_in, getev_flat, transcripts_file=None,
                gene_out_file=None, output_file=None, progresstracker=None):
    """String generator returning JSON-formatted data from GET-Evidence

    Required inputs:
    gff_in: GFF-formatted string generator, text, or .gz gzip-compressed
    getev_flat: passed to read_getev_flat() to load the GET-Evidence data

    Optional inputs:
    transcripts_file, gene_out_file: if both are set, per-gene data is
        written to gene_out_file with gene_report(), using the transcripts
        read from transcripts_file
    output_file: if set, print to this & generator instead yields GFF lines
    progresstracker: ProgressTracker object from progresstracker.py

    Each output line yielded is JSON-formatted and corresponds to data for
    a particular variant. It will always contain 'chromosome',
    'coordinates', 'GET-Evidence', 'genotype', 'ref_allele' and 'autoscore'.
    It contains 'gene' and 'amino_acid_change' when the record has an
    'amino_acid' attribute, and 'dbSNP' when the record lists dbSNP IDs.
    It may also contain 'phase', 'testable', 'reviewed', and items copied
    by copy_output_data.
    """
    # Load data from GET-Evidence and Genetests files.
    getev_by_aa, getev_by_dbsnp = read_getev_flat(getev_flat)
    genetests_filepath = os.path.join(os.getenv('DATA'), GENETESTS_DATA)
    genetests_clin, genetests_rev = read_genetests(genetests_filepath)

    # Set up optional output, will not be compressed.
    f_json_out = None
    f_gene_out = None
    if output_file:
        f_json_out = open(output_file, 'w')
    if gene_out_file and transcripts_file:
        gene_data = dict()
        f_gene_out = open(gene_out_file, 'w')
        transcripts = read_transcripts(transcripts_file)

    try:
        # Set up BLOSUM100 matrix to score amino acid disruptiveness.
        blosum_matrix = blosum100()

        # Set up GFF data. Can be a string generator, text, or
        # (if it ends with '.gz') a gzip-compressed text.
        if isinstance(gff_in, str) and re.search(r'\.gz$', gff_in):
            gff_data = gff.input(gzip.open(gff_in))
        else:
            gff_data = gff.input(gff_in)

        for record in gff_data:
            # If outputing JSON to file, yield GFF data as it's read.
            if f_json_out:
                yield str(record)

            # Ignore regions called as matching reference.
            if record.feature == 'REF':
                continue

            # If producing a gene report, output finished genes
            # NOTE(review): genes still held in gene_data when the input
            # ends are never passed to gene_report -- confirm whether they
            # should be flushed after the loop.
            if f_gene_out:
                to_remove = []
                for gene in gene_data:
                    if gene not in transcripts:
                        # Remove genes we don't recognize
                        to_remove.append(gene)
                    elif transcripts[gene]['end'] < record.end:
                        gene_report(f_gene_out, gene, gene_data[gene])
                        to_remove.append(gene)
                for gene in to_remove:
                    gene_data.pop(gene)

            # Track progress if a ProgressTracker was passed to us
            if progresstracker:
                progresstracker.saw(record.seqname)

            # Store data for JSON output as dict.
            output = dict()

            # Parse GFF attributes to find the alleles, reference allele,
            # phase, and dbSNP.
            # don't sort!
            alleles = record.attributes['alleles'].strip('"').split('/')
            if len(alleles) == 1:
                output['genotype'] = alleles[0]
            elif len(alleles) > 2 or len(alleles) < 1:
                # Not sure what to do with >2 or 0 alleles! Skip it.
                continue
            else:
                output['genotype'] = '/'.join(sorted(alleles))
            ref_allele = record.attributes['ref_allele'].strip('"')
            output['ref_allele'] = ref_allele

            if 'phase' in record.attributes:
                # Add phase attribute for the non-reference allele;
                # if both non-reference, treat as unphased.
                phase_data = record.attributes['phase'].strip().split('/')
                if len(alleles) == 2 and len(phase_data) == 2:
                    if alleles[0] == ref_allele:
                        output['phase'] = phase_data[1]
                    elif alleles[1] == ref_allele:
                        output['phase'] = phase_data[0]

            dbsnp_ids = []
            if ('db_xref' in record.attributes or
                    'Dbxref' in record.attributes):
                if 'db_xref' in record.attributes:
                    raw_entries = record.attributes['db_xref']
                else:
                    raw_entries = record.attributes['Dbxref']
                entries = [d.strip() for d in raw_entries.split(',')]
                for entry in entries:
                    data = entry.split(':')
                    # An entry without a ':' has no ID part; skip it
                    # rather than fail on data[1].
                    if len(data) < 2:
                        continue
                    if re.match('dbsnp', data[0]) and re.match('rs', data[1]):
                        dbsnp_ids.append(data[1])
            if dbsnp_ids:
                output["dbSNP"] = ",".join(dbsnp_ids)

            # Default presence in GET-Evidence is false, set as true later
            # if a match is found.
            output['GET-Evidence'] = False

            # Store position data
            output['chromosome'] = record.seqname
            if record.start == record.end:
                output['coordinates'] = str(record.start)
            else:
                output['coordinates'] = (str(record.start) + "-" +
                                         str(record.end))

            # If there is an amino acid change reported, look it up based
            # on this.
            if "amino_acid" in record.attributes:
                # Get gene and amino acid change, store in output.
                # Note: parse_aa_change will call sys.exit() if it's
                # misformatted.
                # TODO: analyze more than the first change, multiple are
                # split by /
                aa_changes = record.attributes['amino_acid'].split('/')
                aa_data = aa_changes[0].split()
                gene, aa_change_and_pos = aa_data[0:2]
                # "X" is preferred for stop, "*" can break things like URLs.
                aa_change_and_pos = re.sub(r'\*', r'X', aa_change_and_pos)
                (aa_from, aa_pos, aa_to) = parse_aa_change(aa_change_and_pos)
                output["gene"] = gene
                output["amino_acid_change"] = aa_data[1]

                # Check if the gene is in Genetests. If so, store result.
                if gene in genetests_clin:
                    output["testable"] = True
                if gene in genetests_rev:
                    output["reviewed"] = True

                # Try to look up in GET-Evidence by amino acid change.
                aa_key = gene + "-" + aa_change_and_pos
                if aa_key in getev_by_aa:
                    getev_data = getev_by_aa[aa_key]
                    copy_output_data(getev_data, output)
                    output["GET-Evidence"] = True
                else:
                    # If not in GET-Evidence by aa, try dbsnp ID.
                    if "dbSNP" in output:
                        dbsnp_ids = output["dbSNP"].split(",")
                        for dbsnp_id in dbsnp_ids:
                            if dbsnp_id in getev_by_dbsnp:
                                getev_data = getev_by_dbsnp[dbsnp_id]
                                output["GET-Evidence"] = True
                                copy_output_data(getev_data, output)
                                output["autoscore"] = autoscore(
                                    output, blosum_matrix, aa_from, aa_to)
                                # Quit after first hit passing threshold
                                if (output["autoscore"] >= 2 or
                                        suff_eval(output)):
                                    output["dbSNP"] = dbsnp_id
                                    break

                # Calculate autoscore, yield json data if at least 2.
                output["autoscore"] = autoscore(output, blosum_matrix,
                                                aa_from, aa_to)
                if output["autoscore"] >= 2 or suff_eval(output):
                    # This barfs on Unicode sometimes. Skipping the variant
                    # is deliberate; Exception (rather than a bare except)
                    # keeps KeyboardInterrupt and SystemExit working.
                    try:
                        json_output = str(json.dumps(output,
                                                     ensure_ascii=False))
                    except Exception:
                        continue
                    if f_json_out:
                        f_json_out.write(json_output + '\n')
                    else:
                        yield json_output
                    # TODO: print when beyond end of gene, not when new
                    # one seen
                    if f_gene_out and 'ucsc_trans' in record.attributes:
                        # We take 1st & ignore multiple transcripts
                        # (which are rare)
                        gene = record.attributes['ucsc_trans'].split(',')[0]
                        if gene in gene_data:
                            gene_data[gene].append(output)
                        else:
                            gene_data[gene] = [output]
            else:
                # If no gene data at all, try dbsnp ID.
                if "dbSNP" in output:
                    dbsnp_ids = output["dbSNP"].split(",")
                    for dbsnp_id in dbsnp_ids:
                        if dbsnp_id in getev_by_dbsnp:
                            output["GET-Evidence"] = True
                            getev_data = getev_by_dbsnp[dbsnp_id]
                            copy_output_data(getev_data, output)
                            output["autoscore"] = autoscore(output)
                            # Quit after first hit; the ID is only narrowed
                            # to this one when the hit passes the threshold.
                            if output["autoscore"] >= 2 or suff_eval(output):
                                output["dbSNP"] = dbsnp_id
                            break
                output["autoscore"] = autoscore(output)
                # Autoscore bar is lower here because you can only get
                # points if the dbSNP ID is in one of the variant specific
                # databases (max 2).
                if output["autoscore"] >= 1 or suff_eval(output):
                    # This barfs on Unicode sometimes.
                    try:
                        json_output = str(json.dumps(output,
                                                     ensure_ascii=False))
                    except Exception:
                        continue
                    if f_json_out:
                        f_json_out.write(json_output + '\n')
                    else:
                        yield json_output
    finally:
        # Close the output files even when the consumer stops iterating
        # early or an error is raised part-way through.
        if f_json_out:
            f_json_out.close()
        if f_gene_out:
            f_gene_out.close()
def predict_nonsynonymous(gff_input, twobit_path, transcript_path,
                          progresstracker=False):
    """Yield GFF-formatted strings annotated with inferred amino acid changes

    Arguments:
    gff_input: str path (opened with gzip if it ends in ".gz") or any other
        input accepted by gff.input()
    twobit_path: passed to twobit.input()
    transcript_path: passed to transcript_file()
    progresstracker: if set, its saw() is called with each record's
        chromosome name

    Yields header lines, then one string per record. Records for which
    infer_function() reports "nonsynonymous coding" get "amino_acid" and
    "ucsc_trans" attributes; all other records are yielded unchanged.
    """
    twobit_file = twobit.input(twobit_path)
    transcript_input = transcript_file(transcript_path)

    # Set up gff_data
    gzipped = isinstance(gff_input, str) and re.match(r".*\.gz$", gff_input)
    if gzipped:
        gff_data = gff.input(gzip.open(gff_input))
    else:
        # GFF will interpret if gff_input is string containing path
        # to a GFF-formatted text file, or a string generator
        # (e.g. file object) with GFF-formatted strings
        gff_data = gff.input(gff_input)

    wrote_header = False

    for record in gff_data:
        # Have to do this after calling the first record to
        # get the iterator to read through the header data
        if not wrote_header:
            yield "##genome-build " + gff_data.data[1]
            yield "# Produced by: gff_nonsynonymous_filter.py"
            yield "# Date: " + datetime.datetime.now().isoformat(' ')
            wrote_header = True

        if record.feature == "REF":
            yield str(record)
            continue

        # Normalize the chromosome name to a lower-case "chr" prefix.
        seqname = record.seqname
        if seqname.startswith("chr"):
            chromosome = seqname
        elif seqname.startswith("Chr"):
            chromosome = "chr" + seqname[3:]
        else:
            chromosome = "chr" + seqname

        if progresstracker:
            progresstracker.saw(chromosome)

        # record.start is 1-based, but UCSC annotation starts are 0-based,
        # so subtract 1
        transcripts = transcript_input.cover_next_position(
            (chromosome, record.start - 1))

        # Skip the rest if no transcripts are returned
        if not transcripts:
            yield str(record)
            continue

        # otherwise, cycle through
        nonsyn_inferences = []
        splice_inferences = []
        ucsc_transcripts = []
        for data in transcripts:
            # need to make the arguments match up with transcript file order
            # geneName, strand, cdsStart, cdsEnd, exonStarts, exonEnds
            #        0,      3,        6,      7,          9,       10
            gene_name = data[0]
            inference = infer_function(twobit_file, record, gene_name,
                                       data[3], int(data[6]), int(data[7]),
                                       data[9], data[10])
            kind = inference[0]
            if kind == "nonsynonymous coding":
                nonsyn_inferences.append("%s %s" % (gene_name, inference[2]))
                ucsc_transcripts.append(data[1])
            elif kind == "splice site":
                # Not going to report splice sites for now, but leaving the
                # code here because we hope to later. - Madeleine 2010/11/29
                splice_inferences.append("%s %s " % (gene_name, inference[2]))

        # set the attribute if we can
        if nonsyn_inferences:
            unique_inferences = unique(nonsyn_inferences)
            unique_inferences.sort(key=str.lower)
            record.attributes["amino_acid"] = "/".join(unique_inferences)
            record.attributes["ucsc_trans"] = ",".join(ucsc_transcripts)
        yield str(record)
def predict_nonsynonymous(gff_input, twobit_path, transcript_path, progresstracker=False):
    """Annotate GFF records with predicted nonsynonymous amino acid changes.

    Generator of strings. Once the first record has been read it emits
    three header lines, then one GFF-formatted line per input record.
    Records for which at least one covering transcript is inferred to be
    "nonsynonymous coding" gain "amino_acid" and "ucsc_trans" attributes;
    every other record is passed through unchanged.

    Arguments:
    gff_input: path to a GFF file (opened with gzip if it ends in ".gz"),
        or any other input gff.input() understands
    twobit_path: handed to twobit.input() to open the reference genome
    transcript_path: handed to transcript_file() to open the transcripts
    progresstracker: optional; if truthy, its saw() method is called with
        the chromosome name of every non-REF record
    """
    twobit_file = twobit.input(twobit_path)
    transcript_input = transcript_file(transcript_path)

    # A string ending in ".gz" is treated as a gzip-compressed file path;
    # anything else (plain path, file object, string generator) is given
    # to the GFF reader directly.
    is_gzip_path = isinstance(gff_input, str) and re.match(".*\.gz$", gff_input)
    if is_gzip_path:
        gff_data = gff.input(gzip.open(gff_input))
    else:
        gff_data = gff.input(gff_input)

    wrote_header = False
    for record in gff_data:
        # The header can only be produced after the first record has been
        # pulled, because that is what makes the reader parse the GFF header.
        if not wrote_header:
            yield "##genome-build " + gff_data.data[1]
            yield "# Produced by: gff_nonsynonymous_filter.py"
            yield "# Date: " + datetime.datetime.now().isoformat(" ")
            wrote_header = True

        # Reference-matching regions are passed through untouched.
        if record.feature == "REF":
            yield str(record)
            continue

        # Normalise the chromosome name to the "chr..." form.
        if record.seqname.startswith("chr"):
            chromosome = record.seqname
        elif record.seqname.startswith("Chr"):
            chromosome = "chr" + record.seqname[3:]
        else:
            chromosome = "chr" + record.seqname

        if progresstracker:
            progresstracker.saw(chromosome)

        # record.start is 1-based, but UCSC annotation starts are 0-based
        transcripts = transcript_input.cover_next_position(
            (chromosome, record.start - 1))
        if not transcripts:
            yield str(record)
            continue

        nonsyn_inferences = []
        splice_inferences = []
        ucsc_transcripts = []
        for data in transcripts:
            # Transcript columns used: 0 geneName, 3 strand, 6 cdsStart,
            # 7 cdsEnd, 9 exonStarts, 10 exonEnds (and 1 for "ucsc_trans").
            gene_name = data[0]
            inference = infer_function(twobit_file, record, gene_name, data[3],
                                       int(data[6]), int(data[7]),
                                       data[9], data[10])
            kind = inference[0]
            if kind == "nonsynonymous coding":
                nonsyn_inferences.append("%s %s" % (gene_name, inference[2]))
                ucsc_transcripts.append(data[1])
            elif kind == "splice site":
                # Collected but not reported for now (see below).
                splice_inferences.append("%s %s " % (gene_name, inference[2]))

        if nonsyn_inferences:
            unique_inferences = unique(nonsyn_inferences)
            unique_inferences.sort(key=str.lower)
            record.attributes["amino_acid"] = "/".join(unique_inferences)
            record.attributes["ucsc_trans"] = ",".join(ucsc_transcripts)
        # Splice sites are intentionally not written to the record yet; the
        # inferences are gathered above so reporting can be added later.
        yield str(record)
def main():
    """Query the dbSNP database for the position of each GFF record.

    Reads the GFF file named by the first command-line argument. Records
    that start at 1-based position 1 are printed unchanged and not looked
    up; for every other record, `dbsnp_query` is executed with the
    chromosome name (without any "chr" prefix) and the 0-based start.

    Exits with the module usage text when no argument is given, and with
    status 1 (message on stderr) when the database connection fails.
    """
    # return if we don't have the correct arguments
    if len(sys.argv) < 2:
        raise SystemExit(__doc__.replace("%prog", sys.argv[0]))

    # try opening the database connection; fail if unable to open
    try:
        dbsnp_connection = MySQLdb.connect(host=DB_HOST, user=DBSNP_USER,
                                           passwd=DBSNP_PASSWD,
                                           db=DBSNP_DATABASE)
        dbsnp_cursor = dbsnp_connection.cursor()
    except MySQLdb.OperationalError as message:
        # stdout carries the GFF output, so the diagnostic goes to stderr,
        # and the failure is signalled with a non-zero exit status.
        sys.stderr.write("Error %d while connecting to database: %s\n"
                         % (message.args[0], message.args[1]))
        sys.exit(1)

    # now read the file and loop through
    for record in gff.input(sys.argv[1]):
        # the database shows unplaced SNPs as having 0-based position 0
        # (i.e. 1-based position 1), so looking up position 1 would be
        # unfortunate
        if record.start == 1:
            print(record)
            continue

        # the database stores chromosome names without the "chr" prefix
        if record.seqname.startswith("chr"):
            chrom = record.seqname[3:]
        else:
            chrom = record.seqname

        # recall that record.start is 1-based, but the database is not
        dbsnp_cursor.execute(dbsnp_query, (chrom, record.start - 1))
def main():
    """Print one JSON object per distinct amino acid change in a GFF file.

    Reads the GFF file named by the first command-line argument. Each
    record must carry "alleles", "ref_allele" and "amino_acid" attributes.
    For every gene listed in "amino_acid", each distinct change is printed
    as a JSON dictionary describing the position, gene, genotype, zygosity
    and the original GFF line.
    """
    # return if we don't have the correct arguments
    if len(sys.argv) < 2:
        raise SystemExit(__doc__.replace("%prog", sys.argv[0]))

    for record in gff.input(sys.argv[1]):
        # lightly parse alleles
        alleles = record.attributes["alleles"].strip("\"").split("/")
        ref_allele = record.attributes["ref_allele"].strip("\"")

        # compress identical alleles like "A/A" into just "A"
        while len(alleles) > 1 and alleles[0].upper() == alleles[1].upper():
            del alleles[0]

        # determine zygosity; a single remaining allele is the trait allele
        if len(alleles) == 1:
            zygosity, trait_allele = "hom", alleles[0]
        else:
            zygosity, trait_allele = "het", None

        # put the reference allele first in the genotype when it is present
        genotype = "/".join(alleles)
        if ref_allele in alleles:
            leftover_alleles = list(alleles)
            leftover_alleles.remove(ref_allele)
            genotype = ref_allele + "/" + "/".join(leftover_alleles)
            if not trait_allele and len(leftover_alleles) == 1:
                trait_allele = leftover_alleles[0]

        # examine each amino acid change
        for gene_entry in record.attributes["amino_acid"].strip("\"").split("/"):
            fields = gene_entry.split(" ")
            gene = fields[0]  # the first item is always the gene name
            reported = set()
            for amino_acid_change_and_position in fields[1:]:
                # report each change only once per gene
                if amino_acid_change_and_position in reported:
                    continue
                reported.add(amino_acid_change_and_position)

                coordinates = str(record.start)
                if record.start != record.end:
                    coordinates = coordinates + "-" + str(record.end)

                output = {
                    "chromosome": record.seqname,
                    "coordinates": coordinates,
                    "gene": gene,
                    "amino_acid_change": amino_acid_change_and_position,
                    "genotype": genotype,
                    "ref_allele": ref_allele,
                    "trait_allele": trait_allele,
                    "zygosity": zygosity,
                    "variant": str(record),
                }
                print(json.dumps(output))
def report_uncovered(gff_input, transcript_filename, genetests_filename,
                     output_file=None, progresstracker=None):
    """Compare GFF records to transcripts to find missing coding regions

    Reports missing regions, yielding JSON-formatted strings. If output_file
    is provided, instead yields the GFF-formatted strings from gff_input and
    writes the JSON-formatted report strings to file.

    Required arguments:
    gff_input: GFF-formatted strings, string generator or file (can be .gz)
    transcript_filename: transcripts file
    genetests_filename: genetests file

    Optional arguments:
    output_file: If provided, opens and writes to this location (see above);
        gzip-compressed when the name ends in ".gz". The file is closed when
        the generator finishes or is closed.
    progresstracker: If provided, records metadata to progresstracker.metadata
    """
    # Set up GFF input. If it ends with '.gz', assume gzip compressed.
    if isinstance(gff_input, str) and re.search(r"\.gz$", gff_input):
        gff_data = gff.input(gzip.open(gff_input))
    else:
        gff_data = gff.input(gff_input)

    # set up transcript file input
    transcript_input = transcript.Transcript_file(transcript_filename)

    # grab genetests gene names
    genetests_names = set()
    with open(genetests_filename) as genetests_input:
        for line in genetests_input:
            if line.startswith("#"):
                continue
            data = line.split("\t")
            if data[4] == "na":
                continue
            if not re.match(".*Clinical", data[5]):
                # currently we require "clinical testing available"
                continue
            genetests_names.update(data[4].split("|"))

    # Set up optional output. re.search is required here: re.match only
    # matches at the start of the string and so never recognises "x.gz".
    f_out = None
    if output_file:
        if re.search(r"\.gz$", output_file):
            f_out = gzip.open(output_file, 'w')
        else:
            f_out = open(output_file, 'w')

    try:
        # If progresstracker was sent, track these for metadata.
        if progresstracker:
            progresstracker.metadata['ref_coding_n'] = 0
            progresstracker.metadata['ref_coding_clintest_n'] = 0
            progresstracker.metadata['called_coding_n'] = 0
            progresstracker.metadata['called_coding_clintest_n'] = 0

        # Store to-be-examined regions, we'll remove covered regions from
        # this list.
        # key: Transcript object
        # value: list of tuples (chr (string), start (int), end (int))
        # Note: Start is 1-based, not 0-based as is in transcript files
        examined_regions = {}

        for record in gff_data:
            if f_out:
                yield str(record)

            # Move forward in transcripts until past record end.
            chromosome = std_chr_name(record.seqname)
            next_region = (chromosome, record.start, record.end)
            removed_transcripts = transcript_input.cover_next_position(
                next_region)

            for curr_ts in transcript_input.transcripts:
                # Add to examined_regions if new.
                if curr_ts not in examined_regions:
                    examined_regions[curr_ts] = [
                        (curr_ts.data["chr"], start + 1, end)
                        for start, end in zip(curr_ts.data["coding_starts"],
                                              curr_ts.data["coding_ends"])
                    ]
                # Examine regions and remove any covered by the record.
                examined_regions[curr_ts] = remove_covered(
                    examined_regions[curr_ts], record)

            # Process past transcripts.
            results = process_ts_missing(removed_transcripts, examined_regions,
                                         genetests_names, progresstracker)
            for gene_data in results:
                if gene_data["length"] > 0:
                    if f_out:
                        f_out.write(json.dumps(gene_data) + '\n')
                    else:
                        yield json.dumps(gene_data)

        # Move through any remaining transcripts and return missing.
        beyond_end_hack = ("chrZ", 9999999999)
        removed_transcripts = transcript_input.cover_next_position(
            beyond_end_hack)
        remaining_transcripts = removed_transcripts + transcript_input.transcripts
        results = process_ts_missing(remaining_transcripts, examined_regions,
                                     genetests_names, progresstracker)
        for gene_data in results:
            if gene_data["length"] > 0:
                if f_out:
                    f_out.write(json.dumps(gene_data) + '\n')
                else:
                    yield json.dumps(gene_data)
    finally:
        # Flush and release the report file even if the consumer stops
        # iterating early or an error occurs.
        if f_out:
            f_out.close()
def main():
    """Print a concordance matrix (or enumerated records) for two GFF sets.

    The two positional arguments are glob patterns. Every file matching the
    second pattern (rows, numbered) is compared against every file matching
    the first pattern (columns, lettered via excel_column). For each pair
    the intersection is computed in both directions, and overlapping records
    are compared on their sorted "alleles" attribute.

    Output depends on the options:
    option.enumerate: print each compared record pair with a "concordant"
        attribute instead of the matrix
    option.read_depth: cells show mean/median read depth of matching and
        unmatching records
    option.verbose: cells show percentage plus matching/total counts
    otherwise: cells show the percentage of matching records
    """
    # parse options
    option, args = doc_optparse.parse(__doc__)

    if len(args) < 2:
        doc_optparse.exit()

    gff_files_1 = glob.glob(args[0])
    gff_files_2 = glob.glob(args[1])

    # create temporary files to store intersections; they are reused
    # (rewound and truncated) for every pair of files
    temp_file_1 = TemporaryFile()
    temp_file_2 = TemporaryFile()

    if not option.enumerate:
        # use a wider column if we're going to need it
        if option.read_depth:
            col_width = 24
        elif option.verbose:
            col_width = 16
        else:
            col_width = 8

        # print column headings (trailing commas keep output on one line)
        print " " * 8,
        for i in range(1, len(gff_files_1) + 1):
            print excel_column(i).ljust(col_width),
        print ""

    # initialize counter to print row headings
    file_number = 0

    # iterate through the second list of files
    for g2_path in gff_files_2:

        # print row heading
        if not option.enumerate:
            file_number += 1
            print str(file_number).ljust(8),

        # now iterate through the first list, do intersections and compare
        for g1_path in gff_files_1:

            # do the intersection one way
            g1 = gff.input(g1_path)
            g2 = gff.input(g2_path)
            for line in g1.intersect(g2):
                print >> temp_file_1, line

            # now do the intersection the other way (fresh readers are
            # opened for the second pass)
            g1_reverse = gff.input(g1_path)
            g2_reverse = gff.input(g2_path)
            for line in g2_reverse.intersect(g1_reverse):
                print >> temp_file_2, line

            # rewind each temporary file now storing intersection data
            temp_file_1.seek(0)
            temp_file_2.seek(0)

            # now go through the temporary files and work out concordance
            g1_intx = gff.input(temp_file_1)
            g2_intx = gff.input(temp_file_2)
            matching_count = unmatching_count = 0
            # we cannot chain equal signs here, because the two would reference the
            # same list, and that would be bad...
            matching_read_depths, unmatching_read_depths = [], []

            for record1 in g1_intx:
                # the two intersections are walked in lockstep
                record2 = g2_intx.next()

                # these records should match in terms of the interval they represent
                if record2.seqname != record1.seqname or \
                   record2.start != record1.start or \
                   record2.end != record1.end:
                    raise ValueError("files must be pre-sorted")

                # isolate the read depth info if we need to; `rd` is only
                # bound (and only used below) when option.read_depth is set
                if option.read_depth:
                    rd = []
                    try:
                        rd.append(int(record1.attributes["read_depth"].strip("\"")))
                    except KeyError:
                        pass
                    try:
                        rd.append(int(record2.attributes["read_depth"].strip("\"")))
                    except KeyError:
                        pass

                # now test if there's concordance (allele order is ignored)
                try:
                    if sorted(record2.attributes["alleles"].strip("\"").split("/")) != \
                       sorted(record1.attributes["alleles"].strip("\"").split("/")):
                        unmatching_count += 1
                        if option.enumerate:
                            record1.attributes["concordant"] = "false"
                            record2.attributes["concordant"] = "false"
                            print record1
                            print record2
                        if option.read_depth:
                            unmatching_read_depths.extend(rd)
                    else:
                        matching_count += 1
                        if option.enumerate:
                            record1.attributes["concordant"] = "true"
                            record2.attributes["concordant"] = "true"
                            print record1
                            print record2
                        if option.read_depth:
                            matching_read_depths.extend(rd)
                # no alleles? not a SNP
                except KeyError:
                    continue

            # now we print the result, being mindful of possible zero division problems, etc.
            if option.enumerate:
                pass
            elif option.read_depth:
                # NOTE(review): presumably mean()/median() raise TypeError on
                # an empty list, which is shown as "--" — confirm
                try:
                    a = "%.1f" % mean(matching_read_depths)
                    b = "%.1f" % median(matching_read_depths)
                except TypeError:
                    a = "--"
                    b = "--"
                try:
                    c = "%.1f" % mean(unmatching_read_depths)
                    d = "%.1f" % median(unmatching_read_depths)
                except TypeError:
                    c = "--"
                    d = "--"
                print ("%s %s : %s %s" % (a, b, c, d)).ljust(col_width),
            else:
                # no compared records at all gives "--" rather than a percentage
                try:
                    p = "%.1f%%" % (float(matching_count) / (matching_count + unmatching_count) * 100)
                except ZeroDivisionError:
                    p = "--"
                if option.verbose:
                    total_count = unmatching_count + matching_count
                    print ("%s %s/%s" % (p, matching_count, total_count)).ljust(col_width),
                else:
                    print p.ljust(col_width),

            # now we rewind, delete everything, and start again!
            temp_file_1.seek(0)
            temp_file_1.truncate()
            temp_file_2.seek(0)
            temp_file_2.truncate()

        # wrap up the line
        print ""

    # print the legend describing what the column and row headings mean
    if not option.enumerate:
        print "-" * 8
        file_number = 0
        for i in gff_files_1:
            file_number += 1
            print ("[%s]" % excel_column(file_number)).ljust(8),
            print i
        file_number = 0
        for i in gff_files_2:
            file_number += 1
            print ("[%s]" % file_number).ljust(8),
            print i
) cursor = connection.cursor() except MySQLdb.OperationalError, message: sys.stderr.write("Error %d while connecting to database: %s" % (message[0], message[1])) sys.exit() # make sure the required table is really there try: cursor.execute("DESCRIBE latest") except MySQLdb.Error: sys.stderr.write("No 'latest' table => empty output") sys.exit() found_aa_for_rsid = dict() gff_file = gff.input(sys.argv[1]) for record in gff_file: # lightly parse to find the alleles and rs number alleles = record.attributes["alleles"].strip('"').split("/") ref_allele = record.attributes["ref_allele"].strip('"') xrefs = () try: xrefs = record.attributes["db_xref"].strip('"').split(",") except KeyError: try: xrefs = record.attributes["Dbxref"].strip('"').split(",") except KeyError: pass # we wouldn't know what to do with this, so pass it up for now if len(alleles) > 2:
def match_getev(gff_in, getev_flat, transcripts_file=None, gene_out_file=None,
                output_file=None, progresstracker=None):
    """String generator returning JSON-formatted data from GET-Evidence

    Required inputs:
    gff_in: GFF-formatted string generator, text, or .gz gzip-compressed
    getev_flat: GET-Evidence data, passed to read_getev_flat()

    Optional inputs:
    transcripts_file: passed to read_transcripts(); only used together
        with gene_out_file
    gene_out_file: if set (and transcripts_file is set), per-gene reports
        are written to this file through gene_report()
    output_file: if set, print to this & generator instead yields GFF lines
    progresstracker: if set, its saw() method is called with the seqname of
        every non-REF record

    The Genetests data file is located at GENETESTS_DATA under the directory
    named by the DATA environment variable.

    Each JSON line produced corresponds to data for a particular variant.
    It will always contain 'chromosome', 'coordinates', 'GET-Evidence',
    'genotype', 'ref_allele', 'autoscore' and at least one of these two
    possibilities: (1) 'gene' and 'amino_acid_change' or (2) 'dbSNP'. It may
    also contain 'phase', 'testable', 'reviewed', 'suff_eval', and items
    copied by copy_output_data.
    """
    # Load data from GET-Evidence and Genetests files.
    getev_by_aa, getev_by_dbsnp = read_getev_flat(getev_flat)
    genetests_filepath = os.path.join(os.getenv('DATA'), GENETESTS_DATA)
    genetests_clin, genetests_rev = read_genetests(genetests_filepath)

    # Set up optional output, will not be compressed.
    f_json_out = None
    f_gene_out = None
    if output_file:
        f_json_out = open(output_file, 'w')
    # The gene report needs both the output path and the transcripts.
    if gene_out_file and transcripts_file:
        gene_data = dict()
        f_gene_out = open(gene_out_file, 'w')
        transcripts = read_transcripts(transcripts_file)

    # Set up BLOSUM100 matrix to score amino acid disruptiveness.
    blosum_matrix = blosum100()

    # Set up GFF data. Can be a string generator, text, or
    # (if it ends with '.gz') a gzip-compressed text.
    gff_data = None
    if isinstance(gff_in, str) and re.search(r'\.gz$', gff_in):
        gff_data = gff.input(gzip.open(gff_in))
    else:
        gff_data = gff.input(gff_in)

    header_done = False

    for record in gff_data:
        # Have to do this after calling the first record to
        # get the iterator to read through the header data.
        # The header is only yielded when GFF lines are being yielded,
        # i.e. when the JSON goes to a file.
        if (not header_done) and f_json_out:
            yield "##genome-build " + gff_data.data[1]
            yield "# File creation date: " + datetime.datetime.now().isoformat(
                ' ')
            header_done = True

        # If outputting JSON to file, yield GFF data as it's read.
        if f_json_out:
            yield str(record)

        # Ignore regions called as matching reference.
        if record.feature == 'REF':
            continue

        # If producing a gene report, output finished genes
        # (genes whose transcript ends before the current record's end).
        if f_gene_out:
            to_remove = []
            for gene in gene_data:
                if not gene in transcripts:
                    # Remove genes we don't recognize
                    to_remove.append(gene)
                else:
                    if transcripts[gene]['end'] < record.end:
                        gene_report(f_gene_out, gene, gene_data[gene])
                        to_remove.append(gene)
            # Removal is deferred so gene_data is not modified while iterating.
            for gene in to_remove:
                gene_data.pop(gene)

        # Track progress if a ProgressTracker was passed to us
        if progresstracker:
            progresstracker.saw(record.seqname)

        # Store data for JSON output as dict.
        output = dict()

        # Parse GFF attributes to find the alleles, reference allele, phase, and dbSNP.
        alleles = record.attributes['alleles'].strip('"').split(
            '/')  # don't sort!
        if len(alleles) == 1:
            output['genotype'] = alleles[0]
        elif len(alleles) > 2 or len(alleles) < 1:
            # Not sure what to do with >2 or 0 alleles! Skip it.
            continue
        else:
            output['genotype'] = '/'.join(sorted(alleles))
        ref_allele = record.attributes['ref_allele'].strip('"')
        output['ref_allele'] = ref_allele
        if 'phase' in record.attributes:
            # Add phase attribute for the non-reference allele;
            # if both non-reference, treat as unphased.
            phase_data = record.attributes['phase'].strip().split('/')
            if len(alleles) == 2 and len(phase_data) == 2:
                if alleles[0] == ref_allele:
                    output['phase'] = phase_data[1]
                elif alleles[1] == ref_allele:
                    output['phase'] = phase_data[0]
        # Collect dbSNP rs IDs from either spelling of the xref attribute.
        dbsnp_ids = []
        if 'db_xref' in record.attributes or 'Dbxref' in record.attributes:
            if 'db_xref' in record.attributes:
                entries = [
                    d.strip() for d in record.attributes['db_xref'].split(',')
                ]
            else:
                entries = [
                    d.strip() for d in record.attributes['Dbxref'].split(',')
                ]
            for entry in entries:
                # NOTE(review): an entry without ':' makes data[1] raise
                # IndexError — confirm entries are always "source:id"
                data = entry.split(':')
                if re.match('dbsnp', data[0]) and re.match('rs', data[1]):
                    dbsnp_ids.append(data[1])
        if dbsnp_ids:
            output["dbSNP"] = ",".join(dbsnp_ids)

        # Default presence in GET-Evidence is false, set as true later
        # if a match is found.
        output['GET-Evidence'] = False

        # Store position data
        output['chromosome'] = record.seqname
        if record.start == record.end:
            output['coordinates'] = str(record.start)
        else:
            output['coordinates'] = str(record.start) + "-" + str(record.end)

        # List of [gene, amino acid change] pairs, de-duplicated per gene.
        aa_changes = []
        # If there are any amino acid changes reported, look them up
        if "amino_acid" in record.attributes:
            for gene_aa_aa in record.attributes['amino_acid'].split('/'):
                aas = gene_aa_aa.split()
                gene = aas.pop(0)
                aa_seen = {}
                for aa in aas:
                    if aa in aa_seen:
                        continue
                    aa_seen[aa] = 1
                    aa_changes.append([gene, aa])

        # NOTE(review): the same `output` dict is reused (and mutated) for
        # every amino acid change of this record, so keys set by an earlier
        # change (e.g. "autoscore") are still present for later ones, and
        # gene_data may hold several references to one dict — confirm this
        # is intended.
        for aa_data in aa_changes:
            # Get gene and amino acid change, store in output.
            # Note: parse_aa_change will call sys.exit() if it's misformatted.
            gene, aa_change_and_pos = aa_data
            # "X" is preferred for stop, "*" can break things like URLs.
            aa_change_and_pos = re.sub(r'\*', r'X', aa_change_and_pos)
            (aa_from, aa_pos, aa_to) = parse_aa_change(aa_change_and_pos)
            output["gene"] = gene
            # The stored change keeps the original text (before "*" -> "X").
            output["amino_acid_change"] = aa_data[1]

            # Check if the gene is in Genetests. If so, store result.
            if gene in genetests_clin:
                output["testable"] = True
                if gene in genetests_rev:
                    output["reviewed"] = True

            # Try to look up in GET-Evidence by amino acid change.
            aa_key = gene + "-" + aa_change_and_pos
            if aa_key in getev_by_aa:
                getev_data = getev_by_aa[aa_key]
                copy_output_data(getev_data, output)
                output["GET-Evidence"] = True
            else:
                # If not in GET-Evidence by aa, try dbsnp ID.
                if "dbSNP" in output:
                    dbsnp_ids = output["dbSNP"].split(",")
                    for dbsnp_id in dbsnp_ids:
                        if dbsnp_id in getev_by_dbsnp:
                            getev_data = getev_by_dbsnp[dbsnp_id]
                            output["GET-Evidence"] = True
                            copy_output_data(getev_data, output)
                            output["autoscore"] = autoscore(
                                output, blosum_matrix, aa_from, aa_to)
                            output["suff_eval"] = suff_eval(output)
                            # "dbSNP" is narrowed to the ID that matched.
                            output["dbSNP"] = dbsnp_id
                            # Quit after first hit passing threshold
                            if output["autoscore"] >= 2 or output["suff_eval"]:
                                break
            # Calculate autoscore, if not already done during dbSNP selection process
            if not ("autoscore" in output):
                output["autoscore"] = autoscore(output, blosum_matrix, aa_from, aa_to)
            if output["GET-Evidence"]:
                output["suff_eval"] = suff_eval(output)

            # This barfs on Unicode sometimes; on failure the short summary
            # is replaced by a placeholder and serialization is retried.
            try:
                json_output = str(json.dumps(output, ensure_ascii=False))
            except:
                output['summary_short'] = (
                    'Summary for this variant not ' +
                    'displayed. It may contain a Unicode character ' +
                    'preventing it from being properly processed.')
                json_output = str(json.dumps(output, ensure_ascii=False))
            if f_json_out:
                f_json_out.write(json_output + '\n')
            else:
                yield json_output
            # TODO: print when beyond end of gene, not when new one seen
            if f_gene_out and 'ucsc_trans' in record.attributes:
                # We take 1st & ignore multiple transcripts (which are rare)
                gene = record.attributes['ucsc_trans'].split(',')[0]
                if gene in gene_data:
                    gene_data[gene].append(output)
                else:
                    gene_data[gene] = [output]

        if len(aa_changes) == 0:
            # If no gene data at all, try dbsnp ID.
            if "dbSNP" in output:
                dbsnp_ids = output["dbSNP"].split(",")
                for dbsnp_id in dbsnp_ids:
                    if dbsnp_id in getev_by_dbsnp:
                        output["GET-Evidence"] = True
                        getev_data = getev_by_dbsnp[dbsnp_id]
                        copy_output_data(getev_data, output)
                        # No amino acid change here, so autoscore is called
                        # with the output alone.
                        output["autoscore"] = autoscore(output)
                        output["suff_eval"] = suff_eval(output)
                        output["dbSNP"] = dbsnp_id
                        # Quit after first hit passing threshold
                        if output["autoscore"] >= 2 or output["suff_eval"]:
                            break
            # If no gene data and dbSNP id is not listed in
            # GET-Evidence, don't output.
            if "autoscore" in output:
                # This barfs on Unicode sometimes; such variants are skipped.
                try:
                    json_output = str(json.dumps(output, ensure_ascii=False))
                except:
                    continue
                if f_json_out:
                    f_json_out.write(json_output + '\n')
                else:
                    yield json_output

    # NOTE(review): genes still held in gene_data when the input ends do not
    # appear to be passed to gene_report() here — confirm.
    if f_json_out:
        f_json_out.close()
    if f_gene_out:
        f_gene_out.close()
def main():
    """Print the reference sequence of every GFF record in FASTA form.

    The two positional arguments are a 2bit file and a GFF file; they are
    tried in both orders in case they were swapped. Each record is printed
    as a FastaRecord whose header is the GFF line with tabs replaced by "|".

    Options:
    option.flank: number of flanking bases to add on each side; flanks are
        separated from the record's own sequence by blank lines
    option.strand: reverse-complement the sequence of "-" strand records
    option.unique: collapse runs of equal consecutive records into one
        entry carrying a "repetition_count" attribute
    """
    # parse options
    option, args = doc_optparse.parse(__doc__)

    if len(args) < 2:
        doc_optparse.exit()

    flank = int(option.flank or 0)

    # try opening the file both ways, in case the arguments got confused
    try:
        gff_file = gff.input(args[1])
        twobit_file = twobit.input(args[0])
    except Exception:
        gff_file = gff.input(args[0])
        twobit_file = twobit.input(args[1])

    # state used to detect runs of identical records (unique option only)
    previous_record = None
    previous_ref_seq = None
    repetition_count = 1

    for record in gff_file:
        # if we're using the unique option, output the previous record only
        # when we're sure we've seen all repetitions of it
        if option.unique and record == previous_record:
            repetition_count += 1
            continue
        elif option.unique:
            if previous_record:
                previous_record.attributes["repetition_count"] = str(repetition_count)
                print(FastaRecord(str(previous_record).replace("\t", "|"), previous_ref_seq))
            repetition_count = 1
            previous_record = record

        # the 2bit file is indexed by "chr"-prefixed names
        if record.seqname.startswith("chr"):
            chrom = record.seqname
        else:
            chrom = "chr" + record.seqname

        # record coordinates are 1-based inclusive; slices are 0-based
        ref_seq = twobit_file[chrom][(record.start - 1):record.end]

        if flank != 0:
            # calculate the flanks (these variables are 0-based)
            left_flank_start = max(record.start - flank - 1, 0)
            left_flank_end = record.start - 1
            right_flank_start = record.end
            right_flank_end = record.end + flank

            # now find them
            left_flank_seq = twobit_file[chrom][left_flank_start:left_flank_end]
            right_flank_seq = twobit_file[chrom][right_flank_start:right_flank_end]
            ref_seq = left_flank_seq + "\n\n" + ref_seq + "\n\n" + right_flank_seq

        if option.strand and record.strand == "-":
            ref_seq = reverse_complement(ref_seq)

        # we don't output the current record if we're using the unique option
        if option.unique:
            previous_ref_seq = ref_seq
        else:
            print(FastaRecord(str(record).replace("\t", "|"), ref_seq))

    # we'll have one last record yet to output if we used the unique option;
    # with an empty GFF file there is none, so previous_record is still None
    if option.unique and previous_record is not None:
        previous_record.attributes["repetition_count"] = str(repetition_count)
        print(FastaRecord(str(previous_record).replace("\t", "|"), previous_ref_seq))