def load_annos(args):
    """
    Populate a dictionary of Tabixfile handles for each
    annotation file.  Other modules can then access a given
    handle and fetch data from it as follows:

        dbsnp_handle = annotations.annos['dbsnp']
        hits = dbsnp_handle.fetch(chrom, start, end)
    """
    anno_files = get_anno_files(args)
    for anno in anno_files:
        try:
            # .gz denotes Tabix files.
            if anno_files[anno].endswith(".gz"):
                if anno == "clinvar":
                    annos[anno] = pysam.Tabixfile(anno_files[anno],
                                                  encoding='utf8')
                else:
                    annos[anno] = pysam.Tabixfile(anno_files[anno])
            # .bw denotes BigWig files.
            elif anno_files[anno].endswith(".bw"):
                from bx.bbi.bigwig_file import BigWigFile
                annos[anno] = BigWigFile(open(anno_files[anno]))
        except IOError:
            raise IOError("Gemini cannot open this annotation file: %s. \n"
                          "Have you installed the annotation files? If so, "
                          "have they been moved or deleted? Exiting...\n\n"
                          "For more details:\n\t"
                          "http://gemini.readthedocs.org/en/latest/content/"
                          "#installation.html#installing-annotation-files\n"
                          % anno_files[anno])
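# A minimal usage sketch (assumed, not part of the original module): once
# load_annos() has populated the module-level `annos` dict, callers can pull a
# handle by name and iterate the overlapping records for a region. The 'dbsnp'
# key and the helper name are illustrative.
def example_dbsnp_overlaps(chrom, start, end):
    dbsnp_handle = annos['dbsnp']
    # Tabixfile.fetch() yields raw tab-delimited lines unless a parser is given.
    return [hit.split('\t') for hit in dbsnp_handle.fetch(chrom, start, end)]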
def __init__(self, src_file, title='COSMIC', version=None, gpp_tabix_file=None):
    self.title = title
    super(Cosmic, self).__init__(src_file, title=self.title, version=version)

    if gpp_tabix_file is None:
        raise ValueError(
            "A second index by gene protein position must be specified.")

    self.db_genomePos = pysam.Tabixfile(src_file)
    header = self.db_genomePos.header.next()
    self.src_headers = header.lstrip('#').strip().split('\t')

    self.db_geneProteinPos = pysam.Tabixfile(gpp_tabix_file)
    gppHeader = self.db_geneProteinPos.header.next()
    self.gpp_headers = gppHeader.lstrip('#').strip().split('\t')

    self.output_headers = [
        'COSMIC_n_overlapping_mutations',
        'COSMIC_overlapping_mutation_AAs',
        'COSMIC_overlapping_mutation_descriptions',
        'COSMIC_overlapping_primary_sites'
    ]

    self.logger = logging.getLogger(__name__)
def testFetchAll(self):
    remote_file = pysam.Tabixfile(self.url, "r")
    remote_result = list(remote_file.fetch())
    local_file = pysam.Tabixfile(self.local, "r")
    local_result = list(local_file.fetch())

    self.assertEqual(len(remote_result), len(local_result))
    for x, y in zip(remote_result, local_result):
        self.assertEqual(x, y)
def __init__(self, format_fn, exomes=True):
    self.exomes = exomes
    self.tabix_files = {}
    if exomes:
        self.tabix_files['_'] = pysam.Tabixfile(format_fn)
    else:
        for chrom in map(str, range(1, 23)) + ['X']:
            self.tabix_files[chrom] = pysam.Tabixfile(
                format_fn.format(chrom))
def __init__(self):
    ref_path = '/stor/work/Lambowitz/ref/hg19_ref/genes'
    exons = ref_path + '/gencode.exon.bed.gz'
    self.exons = pysam.Tabixfile(exons)

    transcriptom_peaks = project_path + '/transcriptome/macs2/unfragmented.fwd_peaks_genomics.narrowPeak.gz'
    self.transcriptome_peaks = pysam.Tabixfile(transcriptom_peaks)

    self.bam = pysam.Samfile(
        project_path + '/merged_bam/dedup/unfragmented.chrM_filter.dedup.bam')
    self.bed = pysam.Tabixfile(
        project_path + '/bed_files/merged_bed/unfragmented.bed.gz')
def __init__(self, vcffile=None):
    self.vcffile = vcffile
    self.filename = os.path.splitext(os.path.basename(str(vcffile)))[0]
    self.header = open('%s/scripts/header.vcf' % (settings.BASE_DIR)).readlines()

    # create the merge folder if it doesn't exist, then work inside it
    if not os.path.exists('merge'):
        os.makedirs('merge')
    os.chdir('merge')

    self.annotation_files = OrderedDict()

    pysam.tabix_index('../snpeff/snpeff.output.vcf', preset='vcf')
    self.annotation_files['snpeff'] = {
        'info': 'EFF',
        'file': pysam.Tabixfile('../snpeff/snpeff.output.vcf.gz', 'r',
                                encoding="utf-8")
    }

    pysam.tabix_index('../vep/vep.output.sorted.vcf', preset='vcf')
    self.annotation_files['vep'] = {
        'info': 'CSQ',
        'file': pysam.Tabixfile('../vep/vep.output.sorted.vcf.gz', 'r',
                                encoding="utf-8")
    }

    pysam.tabix_index('../snpsift/snpsift.final.vcf', preset='vcf')
    self.annotation_files['vartype'] = {
        'info': 'VARTYPE,SNP,MNP,INS,DEL,MIXED,HOM,HET',
        'file': pysam.Tabixfile('../snpsift/snpsift.final.vcf.gz', 'r',
                                encoding="utf-8")
    }

    self.dbsnp = pysam.Tabixfile(settings.dbsnp, 'r', encoding="utf-8")
def testFetchAll(self):
    shutil.copyfile(self.filename_src, self.filename_dst)
    shutil.copyfile(self.index_src, self.index_dst)

    same_basename_file = pysam.Tabixfile(self.filename_src, "r",
                                         index=self.index_src)
    same_basename_results = list(same_basename_file.fetch())
    diff_index_file = pysam.Tabixfile(self.filename_dst, "r",
                                      index=self.index_dst)
    diff_index_result = list(diff_index_file.fetch())

    self.assertEqual(len(same_basename_results), len(diff_index_result))
    for x, y in zip(same_basename_results, diff_index_result):
        self.assertEqual(x, y)
def __init__(self, fname=None, fileobj=None, region=None):
    self._bins = {}
    self._bin_list = []
    self._cur_bin_idx = 0
    self._cur_bin_pos = 0
    self._tellpos = 0
    self._total = 0
    self._length = 0
    self.__tabix = None

    self.filename = fname

    if os.path.exists('%s.tbi' % fname):
        self.__tabix = pysam.Tabixfile(fname)

    if fileobj:
        self.__readfile(fileobj)
    elif fname:
        with ngsutils.support.ngs_utils.gzip_opener(fname) as fobj:
            self.__readfile(fobj)
    elif region:
        chrom, startend = region.split(':')
        if '-' in startend:
            start, end = [int(x) for x in startend.split('-')]
        else:
            start = int(startend)
            end = start
        start -= 1

        self.__add_region(BedRegion(chrom, start, end))
    else:
        raise ValueError(
            "Must specify either filename, fileobj, or region")
def plot_genes(self, fn_genes):
    #genes = regions_from_bed(fn_genes,names=True)
    #locs,vals = genes.get_locations_over_interval(self.chr,self.start,self.end)
    max_loc = -1
    y = 0
    last_start, last_end = 0, 0
    #for i in xrange(locs.shape[0]):
    count = 0
    max_count = 10
    #for gene_line in tabix.Tabix(fn_genes).fetch(self.chr,self.start,self.end):
    tbx_genes = pysam.Tabixfile(fn_genes)
    for gene_line in tbx_genes.fetch(self.chr, self.start, self.end):
        sline = gene_line.split()
        start, end, name = int(sline[4]), int(sline[5]), sline[12]
        if start == last_start and end == last_end:
            continue
        #y=start<max_loc and y-1.5 or 0
        if count == max_count:
            y = 0
            count = 0
        else:
            y -= 1
            count += 1
        f_size = 8
        max_loc = max(max_loc, end + f_size * 1.1 * len(name))
        self.gene_ax.plot([start, end], [y, y], 'g', linewidth=4, alpha=.6)
        self.gene_ax.annotate(name, (end + 100, y),
                              fontsize=f_size,
                              horizontalalignment='left')
        last_start, last_end = start, end
def getphastcons(kmerpos, phastconsbed):
    #kmerpos = {} # {age : {location : [[chrm, kmerstart, kmerstop, strand]]}}
    phastconsdict = {}  # {age : {location : {binnumber : [phastconsvalue1, phastconsvalue2, ...]}}}
    #Bin1 is 100 bp upstream of kmerstart.  For a 4mer, the kmer would be bins
    #101-104, and bins 105-205 would be 100 bp downstream of kmerstop.
    phastconstabix = pysam.Tabixfile(phastconsbed)

    for age in kmerpos:
        phastconsdict[age] = {}
        for location in kmerpos[age]:
            phastconsdict[age][location] = {}
            for kmer in kmerpos[age][location]:
                chrm = kmer[0]
                kmerstart = int(kmer[1])
                kmerstop = int(kmer[2])
                strand = kmer[3]
                phastconsscores = {}  # {windowbin : score}
                if strand == '+':
                    windowstart = kmerstart - 100
                    windowend = kmerstop + 100
                    try:
                        for bed in phastconstabix.fetch(chrm, windowstart, windowend,
                                                        parser=pysam.asBed()):
                            windowbin = str(int(bed.start) - windowstart)
                            phastconsscore = float(bed.name)
                            phastconsscores[windowbin] = phastconsscore
                    except ValueError:
                        print 'WARNING: problem with {0}:{1}-{2}:{3}.'.format(
                            str(chrm), str(kmerstart), str(kmerstop), strand)
                elif strand == '-':
                    windowstart = kmerstart - 100
                    windowend = kmerstop + 100
                    try:
                        for bed in phastconstabix.fetch(chrm, windowstart, windowend,
                                                        parser=pysam.asBed()):
                            windowbin = str(windowend - int(bed.start))
                            phastconsscore = float(bed.name)
                            phastconsscores[windowbin] = phastconsscore
                    except ValueError:
                        print 'WARNING: problem with {0}:{1}-{2}:{3}.'.format(
                            str(chrm), str(kmerstart), str(kmerstop), strand)

                # if there were any bases in the UTR that had phastcons scores
                if len(phastconsscores) > 0:
                    for windowbin in phastconsscores:
                        if windowbin not in phastconsdict[age][location]:
                            phastconsdict[age][location][windowbin] = [
                                phastconsscores[windowbin]
                            ]
                        else:
                            phastconsdict[age][location][windowbin].append(
                                phastconsscores[windowbin])

    return phastconsdict
def annotate_variants_bool(args, select_cursor, update_cursor):
    """
    Populate a new, user-defined column in the variants
    table with a BOOLEAN indicating whether or not
    overlaps were detected between the variant and the
    annotation file.
    """
    add_requested_column(args.col_name, update_cursor)

    # For each, use Tabix to detect overlaps with the user-defined
    # annotation file.  Update the variant row with T/F if overlaps found.
    annos = pysam.Tabixfile(args.anno_file)
    select_cursor.execute("SELECT chrom, start, end, variant_id FROM variants")
    for row in select_cursor:
        has_hit = False
        for hit in annos.fetch(str(row['chrom']), int(row['start']),
                               int(row['end'])):
            has_hit = True
            break

        if has_hit:
            update_qry = "UPDATE variants SET " + args.col_name + " = 1 " + \
                         " WHERE variant_id = " + str(row['variant_id'])
        else:
            update_qry = "UPDATE variants SET " + args.col_name + " = 0 " + \
                         " WHERE variant_id = " + str(row['variant_id'])
        update_cursor.execute(update_qry)
def get(self, chrom, position, ref, alt):
    if self.has_chr_prefix and not chrom.startswith('chr'):
        chrom = 'chr' + chrom
    elif not self.has_chr_prefix and chrom.startswith('chr'):
        chrom = chrom[3:]

    if not self.overlaps(chrom, position):
        self.chrom = chrom
        self.start = position
        self.stop = position + self.step_bp
        self.data = dict()
        for f in self.files:
            with pysam.Tabixfile(f, 'r') as tabix:
                for row in tabix.fetch(self.chrom, self.start - 1, self.stop + 1,
                                       parser=pysam.asTuple()):
                    name = ':'.join(row[:4])
                    cadd_raw, cadd_phred = map(float, row[4:6])
                    if name in self.data:
                        if self.data[name][1] < cadd_phred:
                            self.data[name] = (cadd_raw, cadd_phred)
                    else:
                        self.data[name] = (cadd_raw, cadd_phred)

    return self.data.get(':'.join((chrom, str(position), ref, alt)), (None, None))
def testBasicFetching(self):
    '''Test fetching records with only a gene protein position from an
    existing tabix file.'''
    genePPFilename = "testdata/small_cosmic_with_gp_and_gpp/small_cosmic_trimmed_for_sorting.txt.tbi.byAA.sorted.tsv.gz"
    tabixFile = pysam.Tabixfile(genePPFilename)
    headers = [
        "Gene_name", "HGNC_ID", "Sample_name", "Primary_site", "Site_subtype",
        "Primary_histology", "Histology_subtype", "Mutation_ID", "Mutation_CDS",
        "Mutation_AA", "Mutation_Description", "Mutation_zygosity",
        "Mutation_NCBI36_genome_position", "Mutation_GRCh37_genome_position",
        "Pubmed_PMID", "startAA", "endAA"
    ]

    results = tabixFile.fetch(reference="EGFR", start=748, end=749)
    resultDicts = []
    for result in results:
        resultDicts.append(dict(zip(headers, result.strip().split('\t'))))
    self.assertTrue(
        len(resultDicts) == 2,
        "Should have only had two entries, but found: " + str(resultDicts))

    # A1BG 5 ME024T NS NS malignant_melanoma NS 226401 c.1132G>A p.D378N
    # Substitution - Missense unk 19:58861796-58861796 22622578
    results = tabixFile.fetch(reference="A1BG", start=377, end=378)
    resultDicts = []
    for result in results:
        resultDicts.append(dict(zip(headers, result.strip().split('\t'))))
    self.assertTrue(
        len(resultDicts) == 1,
        "Should have only had one entry, but found: " + str(resultDicts))
def load_chromosome(db, chromosome):
    ''' load genome reference '''
    import pysam
    db_dict = defaultdict(list)
    f = pysam.Tabixfile(db)
    for line in f.fetch(chromosome):
        chrom, pos = line.split('\t')[0:2]
        # resolve position when record is a deletion
        if len(line.split()[3]) > len(line.split()[4]):
            pos = resolve_db_position(args.input_type, int(pos))
        # strip any 'chr' prefix so keys are consistent
        chrom = chrom.replace('chr', '')
        key = ','.join([chrom, str(pos)])
        if args.flag_with_id:
            value = line.split('\t')[2]
        else:
            value = 'T'
        if value not in db_dict[key]:
            db_dict[key].append(value)
    f.close()
    return db_dict
def main(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument("--genotype", help="Tabix indexed pileup file.",
                        required=True)
    parser.add_argument("--intervals", help="BED file.", required=True)
    parser.add_argument("--padding", type=int, default=10,
                        help="Number of bases to expand intervals, when "
                             "filtering based on adjacent indels [%(default)s]")
    parser.add_argument("--min-distance-to-indels", type=int, default=5,
                        help="Variants closer than this distance from indels "
                             "are filtered [%(default)s].")
    args = parser.parse_args(argv)

    genotype = pysam.Tabixfile(args.genotype)
    with open(args.intervals) as bed_file:
        intervals = text.parse_lines_by_contig(bed_file, pysam.asBed())

    for (_, beds) in sorted(intervals.items()):
        for (name, sequence) in build_genes(args, genotype, beds):
            FASTA(name, None, sequence).write(sys.stdout)

    return 0
def setUp(self):
    self.tabix = pysam.Tabixfile(self.filename)
    self.compare = [
        x[:-1].split("\t")
        for x in gzip.open(self.filename, "r")
        if not x.startswith("#")
    ]
def coverage(bam_paths, gtf_path, transcript_ids=None,
             verbose=False, agg_func=None):
    # Setup record iterator from gtf file.
    gtf_file = pysam.Tabixfile(gtf_path, parser=pysam.asGTF())
    gtf_records = (rec for rec in gtf_file.fetch() if rec.feature == 'exon')

    if transcript_ids is not None:
        transcript_ids = set(transcript_ids)
        gtf_records = (rec for rec in gtf_records
                       if rec['transcript_id'] in transcript_ids)

    if verbose:
        gtf_records = tqdm(gtf_records, leave=False)

    # Build frame.
    rows = _coverage_gen(bam_paths, gtf_records, agg_func=agg_func)
    index_names = ['transcript_id', 'chr', 'start', 'end', 'strand']

    result = pd.DataFrame.from_records(
        rows, columns=index_names + list(bam_paths))
    result = result.set_index(index_names)

    return result
def parse_tabix_file_subset(tabix_filenames, subset_i, subset_n, record_parser):
    """
    Returns a generator of parsed record objects (as returned by record_parser)
    for the i'th out of n subsets of records across all the given tabix_file(s).
    The records are split by files and contigs within files, with 1/n of all
    contigs from all files being assigned to the i'th subset.

    Args:
        tabix_filenames: a list of one or more tabix-indexed files. These will be
            opened using pysam.Tabixfile
        subset_i: zero-based subset number
        subset_n: total number of subsets
        record_parser: a function that takes a file-like object and returns a
            generator of parsed records
    """
    start_time = time.time()
    open_tabix_files = [pysam.Tabixfile(tabix_filename)
                        for tabix_filename in tabix_filenames]
    tabix_file_contig_pairs = [(tabix_file, contig)
                               for tabix_file in open_tabix_files
                               for contig in tabix_file.contigs]
    # get every n'th tabix_file/contig pair
    tabix_file_contig_subset = tabix_file_contig_pairs[subset_i::subset_n]
    short_filenames = ", ".join(map(os.path.basename, tabix_filenames))
    num_file_contig_pairs = len(tabix_file_contig_subset)
    print(("Loading subset %(subset_i)s of %(subset_n)s total: "
           "%(num_file_contig_pairs)s contigs from "
           "%(short_filenames)s") % locals())

    counter = 0
    for tabix_file, contig in tabix_file_contig_subset:
        header_iterator = tabix_file.header
        records_iterator = tabix_file.fetch(contig, 0, 10**9,
                                            multiple_iterators=True)
        for parsed_record in record_parser(itertools.chain(header_iterator,
                                                           records_iterator)):
            counter += 1
            yield parsed_record

            if counter % 100000 == 0:
                seconds_elapsed = int(time.time() - start_time)
                print(("Loaded %(counter)s records from subset %(subset_i)s of "
                       "%(subset_n)s from %(short_filenames)s "
                       "(%(seconds_elapsed)s seconds)") % locals())

    print("Finished loading subset %(subset_i)s from %(short_filenames)s "
          "(%(counter)s records)" % locals())
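# A minimal usage sketch (assumed, not from the original module): because the
# subsets partition contigs, iterating subset_i over range(subset_n), e.g. from
# separate worker processes, visits every record exactly once. The
# `parse_vcf_records` callable stands in for a hypothetical record_parser.
def load_all_subsets(tabix_filenames, subset_n, parse_vcf_records):
    for subset_i in range(subset_n):
        for record in parse_tabix_file_subset(tabix_filenames, subset_i,
                                              subset_n, parse_vcf_records):
            yield record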
def __init__(self, tabix_file_name, **dict):
    '''
    wrapped in DBI.init(filename, "tabix")
    '''
    self.tabix_file_name = tabix_file_name
    self.dict = dict
    try:
        self.data = pysam.Tabixfile(tabix_file_name)
    except:
        print >>sys.stderr, "WARNING: Can't init the tabix file", tabix_file_name
    self.header = None
    if dict.has_key("header") and dict["header"] == True:
        f = TableIO.parse(tabix_file_name)
        h = f.next()
        l = len(h)
        for i in range(l):
            h[i] = h[i].strip()
        self.header = h
        f.close()
    elif dict.has_key("header") and isinstance(dict["header"], list):
        self.header = dict["header"]
    elif dict.has_key("header") and isinstance(dict["header"], str):
        fh = TableIO.parse(dict["header"])
        self.header = fh.next()
        #print >>sys.stderr,self.header
    self.tabix_format = "simple"
    if self.dict.has_key("tabix"):
        self.tabix_format = self.dict["tabix"]
def _annotate_variants(args, conn, metadata, get_val_fn, col_names=None,
                       col_types=None, col_ops=None):
    """Generalized annotation of variants with a new column.

    get_val_fn takes a list of annotations in a region and returns
    the value for that region to update the database with.

    Separates selection and identification of values from update,
    to avoid concurrent database access errors from sqlite, especially on
    NFS systems. The retained to_update list is small, but batching
    could help if memory issues emerge.
    """
    # For each, use Tabix to detect overlaps with the user-defined
    # annotation file. Update the variant row with T/F if overlaps found.
    anno = pysam.Tabixfile(args.anno_file)
    naming = guess_contig_naming(anno)
    cursor = conn.bind.connect()
    add_requested_columns(args, cursor, col_names, col_types)
    conn.commit()
    cursor.close()

    conn, metadata = database.get_session_metadata(str(conn.bind.url))
    cursor = conn.bind.connect()

    last_id = 0
    current_id = 0
    total = 0
    CHUNK_SIZE = 100000
    to_update = []

    select_res = cursor.execution_options(stream_results=True).execute(
        '''SELECT chrom, start, end, ref, alt, variant_id FROM variants''')
    while True:
        for row in select_res.fetchmany(CHUNK_SIZE):
            # update_data starts out as a list of the values that should
            # be used to populate the new columns for the current row.
            # Prefer no pysam parsing over tuple parsing to work around bug in pysam 0.8.0
            # https://github.com/pysam-developers/pysam/pull/44
            if args.anno_file.endswith(('.vcf', '.vcf.gz')):
                update_data = get_val_fn(annotations_in_vcf(row, anno, None, naming,
                                                            args.region_only, True))
            else:
                update_data = get_val_fn(annotations_in_region(row, anno, None, naming))
            #update_data = get_val_fn(annotations_in_region(row, anno, "tuple", naming))

            # were there any hits for this row?
            if len(update_data) > 0:
                # we add the primary key to update_data for the
                # where clause in the SQL UPDATE statement.
                update_data.append(str(row["variant_id"]))
                to_update.append(tuple(update_data))

            current_id = row["variant_id"]

        if current_id <= last_id:
            break
        elif len(to_update) > 0:
            _update_variants(metadata, to_update, col_names, cursor)

            total += len(to_update)
            print("updated", total, "variants")

        last_id = current_id
        to_update = []
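# A minimal sketch of a get_val_fn (illustrative, not part of the original
# module): count the annotation records overlapping a variant and return the
# count as a single-element list, matching the list-of-column-values shape that
# _annotate_variants expects before it appends the variant_id.
def count_overlaps(hits):
    hits = list(hits)
    return [len(hits)] if hits else []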
def annotate_variants_list(args, select_cursor, update_cursor):
    """
    Populate a new, user-defined column in the variants
    table with an INTEGER indicating the count of overlaps
    between the variant and the annotation file.
    """
    add_requested_column(args.col_name, update_cursor)

    # For each, use Tabix to count overlaps with the user-defined
    # annotation file.  Update the variant row with the count.
    annos = pysam.Tabixfile(args.anno_file)
    select_cursor.execute("SELECT chrom, start, end, variant_id FROM variants")
    for row in select_cursor:
        hit_list = []
        for hit in annos.fetch(str(row['chrom']), int(row['start']),
                               int(row['end']),
                               parser=pysam.asTuple()):
            try:
                hit_list.append(hit[int(args.col_extract) - 1])
            except IndexError:
                sys.exit("Column " + args.col_extract + " exceeds the number "
                         "of columns in your annotation file. Exiting.")

        hits = ",".join(hit_list)
        if len(hit_list):
            update_qry = "UPDATE variants SET " + args.col_name + " = '" + hits + \
                         "' WHERE variant_id = " + str(row['variant_id'])
        else:
            update_qry = "UPDATE variants SET " + args.col_name + " = NULL" + \
                         " WHERE variant_id = " + str(row['variant_id'])
        update_cursor.execute(update_qry)
def SNP_chunhe(inputItem):
    global SNPFileName
    snpMAFvalue = []  # MAF values of the SNP sites that fall within this CBS region
    snpNum = 0        # total number of SNP sites
    MAFNum = 0        # number of sites with MAF >= 0.8
    MAFratio = 0      # MAFNum / snpNum
    tb = pysam.Tabixfile(SNPFileName)
    for item in tb.fetch(inputItem[1], int(inputItem[2]), int(inputItem[3])):
        snplist = item.strip().split("\t")
        if int(snplist[1]) in range(int(inputItem[2]), int(inputItem[3]) + 1):
            snpMAFvalue.append(snplist[16])  # column 17 of the pileup file holds the MAF value
            if float(snplist[16]) >= 0.8:
                MAFNum += 1
    snpNum = len(snpMAFvalue)
    if snpNum == 0:
        MAFvaluemean = 0
        snpNum = 0
        inputItem.append(str(0))   # append MAF-value-mean
        inputItem.append(str(0))   # append All-SNP-num
        inputItem.append("None")   # append MAF-value-num (>=0.8)
        inputItem.append("None")   # append MAF-ratio (= MAF-value-num / All-SNP-num)
    else:
        MAFvalue = sum(map(lambda x: float(x), snpMAFvalue))  # sum of all MAF values
        MAFvaluemean = round(MAFvalue / snpNum, 2)             # mean MAF
        MAFratio = round(MAFNum / snpNum, 2)                   # fraction of sites with MAF >= 0.8
        inputItem.append(str(MAFvaluemean))  # append MAF-value-mean
        inputItem.append(str(snpNum))        # append All-SNP-num
        inputItem.append(str(MAFNum))        # append MAF-value-num (>=0.8)
        inputItem.append(str(MAFratio))      # append MAF-ratio (= MAF-value-num / All-SNP-num)
    # print inputItem
    return inputItem
def ensureIndexed(bedPath, preset="bed", trySorting=True):
    if not bedPath.endswith(".gz"):
        if not os.path.exists(bedPath + ".gz"):
            logging.info("bgzf compressing {}".format(bedPath))
            pysam.tabix_compress(bedPath, bedPath + ".gz")
            if not os.path.exists(bedPath + ".gz"):
                raise Exception(
                    "Failed to create compressed {preset} file for {file}; "
                    "make sure the {preset} file is sorted and the directory "
                    "is writeable".format(preset=preset, file=bedPath))
        bedPath += ".gz"

    if not os.path.exists(bedPath + ".tbi"):
        logging.info("creating tabix index for {}".format(bedPath))
        pysam.tabix_index(bedPath, preset=preset)
        if not os.path.exists(bedPath + ".tbi"):
            raise Exception(
                "Failed to create tabix index file for {file}; make sure the "
                "{preset} file is sorted and the directory is "
                "writeable".format(preset=preset, file=bedPath))

    line = pysam.Tabixfile(bedPath).fetch().next()
    if len(line.strip().split("\t")) < 6 and preset == "bed":
        raise AnnotationError(
            "BED files need to have at least 6 (tab-delimited) fields (including "
            "chrom, start, end, name, score, strand; score is unused)")
    if len(line.strip().split("\t")) < 9 and preset == "gff":
        raise AnnotationError(
            "GFF/GTF files need to have at least 9 tab-delimited fields")

    return bedPath
def load_contig(contig):
    '''save cadd contig into mongodb collection.
       should be an iterable.
    '''
    # if CADD_INPUT == "exome":
    #     CADD_INPUT = exome
    tabix = pysam.Tabixfile(whole_genome)
    src_db = get_src_db()
    target_coll = src_db["cadd"]
    t0 = time.time()
    cnt = 0
    docs = (doc for doc in fetch_generator(tabix, contig))
    doc_list = []
    for doc in docs:
        doc_list.append(doc)
        cnt += 1
        if len(doc_list) == 100:
            target_coll.insert(doc_list, manipulate=False,
                               check_keys=False, w=0)
            doc_list = []
        if cnt % 100000 == 0:
            print(cnt, timesofar(t0))
    if doc_list:
        target_coll.insert(doc_list, manipulate=False,
                           check_keys=False, w=0)
    print("successfully loaded cadd chromosome %s into mongodb" % contig)
    print("total docs: {}; total time: {}".format(cnt, timesofar(t0)))
def main(args):
    tbx = pysam.Tabixfile(args.tabix)

    with open(args.table) as l1seq:
        for line in l1seq:
            if line.startswith('UUID'):
                header = line.strip().split()
                header.append(args.name)
                print '\t'.join(header)
                continue

            c = line.strip().split()
            chrom = c[1]
            start = int(c[2])
            end = int(c[3])

            annotations = []

            if chrom in tbx.contigs:
                for rec in tbx.fetch(chrom, start, end):
                    if args.nonref:
                        sfam = c[6]
                        annot = rec.strip().split()
                        if annot[3] == sfam:
                            annotations.append('|'.join(annot))
                    else:
                        annotations.append('|'.join(rec.strip().split()))

            annotations = list(set(annotations))  # uniqify

            if len(annotations) == 0:
                annotations.append('NA')

            print line.strip() + '\t' + ','.join(annotations)
def main(argv):
    parser = argparse.ArgumentParser(prog="paleomix vcf_to_fasta")
    parser.add_argument("--genotype", help="Tabix indexed VCF file.",
                        required=True)
    parser.add_argument("--intervals", help="BED file.", required=True)
    parser.add_argument("--padding", type=int, default=10,
                        help="Number of bases to expand intervals, when "
                             "checking for adjacent indels [%(default)s]")
    parser.add_argument("--whole-codon-indels-only",
                        action="store_true", default=False,
                        help="If set, only indels where (length %% 3) == 0 "
                             "are retained [%(default)s]")
    parser.add_argument("--ignore-indels",
                        action="store_true", default=False,
                        help="Do not include indels in the generated FASTA "
                             "sequence [%(default)s].")
    opts = parser.parse_args(argv)

    print("Running buildRegions.py", end="", file=sys.stderr)
    if opts.whole_codon_indels_only:
        print(", assuming sequences represent CDS", end="", file=sys.stderr)
    print(" ...", file=sys.stderr)

    genotype = pysam.Tabixfile(opts.genotype)

    intervals = read_intervals(opts.intervals)
    if intervals is None:
        return 1

    return genotype_genes(opts, intervals, genotype)
def fetch(self, chrom, start, end=None):
    """ fetch records from a Tabix indexed VCF, requires pysam
        if start and end are specified, return iterator over positions
        if end not specified, return individual ``_Call`` at start or None
    """
    if not pysam:
        raise Exception('pysam not available, try "pip install pysam"?')

    if not self.filename:
        raise Exception('Please provide a filename (or a "normal" fsock)')

    if not self._tabix:
        self._tabix = pysam.Tabixfile(self.filename)

    if self._prepend_chr and chrom[:3] == 'chr':
        chrom = chrom[3:]

    # tabix needs position - 1 because pysam uses 0-based, half-open
    # coordinates while VCF positions are 1-based
    start = start - 1

    if end is None:
        self.reader = self._tabix.fetch(chrom, start, start + 1)
        try:
            return self.next()
        except StopIteration:
            return None

    self.reader = self._tabix.fetch(chrom, start, end)
    return self
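# A minimal usage sketch (assumed, not from the original module): with a
# bgzipped, tabix-indexed VCF, a bare 1-based position returns a single record
# (or None), while a start/end pair returns the reader itself as an iterator.
# The vcf.Reader constructor call and the coordinates are illustrative.
reader = vcf.Reader(filename='calls.vcf.gz')
single = reader.fetch('20', 14370)
for record in reader.fetch('20', 14370, 17330):
    print(record)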
class VcfReader:
    """ Read comfortably from VCF style files with main focus on
        chr, start, ref and alt fields.

        Note that this API uses 1-based coordinates with both start and
        end included in the interval. PySam API uses 0-based half-open
        intervals, so we have to convert internally.
    """

    def __init__(self, input_file):
        self.filename = input_file
        self.indexed = False

        if input_file.strip() == "-":
            ifile = sys.stdin
        elif input_file.endswith(".bz2"):
            try:
                ifile = bz2file.BZ2File(input_file, "r", buffering=0)
            except Exception, e:
                raise e
        elif input_file.endswith(".gz") or input_file.endswith(".bgz"):
            # try to open the file with Tabix
            try:
                ifile = pysam.Tabixfile(input_file, parser=pysam.asVCF())
                self.indexed = True
            except Exception, e:
                try:
                    ifile = gzip.GzipFile(input_file, "r")
                except Exception, e:
                    raise e
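# A minimal sketch of the coordinate conversion described in the docstring
# (illustrative, not the class's actual fetch method): a 1-based, fully closed
# interval [start, end] maps onto pysam's 0-based, half-open interval by
# shifting start down by one and leaving end unchanged.
def fetch_closed_interval(tabixfile, chrom, start, end):
    return tabixfile.fetch(chrom, start - 1, end, parser=pysam.asVCF())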
def process_vcf_slice(tabix_file, chrm, start, stop, position_data):
    tbx = pysam.Tabixfile(tabix_file)
    tbx_lines = tbx.fetch(chrm, start, stop)

    numb_of_seqs = len(position_data._fields[9:])
    alignment = np.zeros((stop - start, numb_of_seqs), np.string0)

    # This 'error handling' needs to be rewritten.
    current_data = []
    if tbx_lines is None:
        return 'error'

    for line in tbx_lines:
        current_base = position_data._make(line.strip().split("\t"))
        base_calls = callSNPs(current_base, numb_of_seqs)
        current_data.append(base_calls.copy())

    alignment = np.array(current_data)
    inform_sites = count_informative_sites(alignment)

    if current_base is None:
        return 'error'
    else:
        taxa = current_base._fields[9:]
        info = "tree 'chrm={0},start={1},stop={2},inform_sites={3}':".format(
            current_base.CHROM, start, stop, inform_sites)
        oneliner = array2OnelinerAlignment(info, taxa, alignment.transpose())

    # this prevents bad alignments from getting printed
    if ":" in oneliner and oneliner[-1] == ';':
        return oneliner
    else:
        return 'error'
def get_vcf_lines(refvcf, pos_buffer, chr, pos):
    startpos = int(pos) - int(pos_buffer)
    endpos = int(pos) + int(pos_buffer)

    tabixfile = pysam.Tabixfile(refvcf)
    vcfline_generator = tabixfile.fetch(chr, startpos, endpos)
    lines = list(vcfline_generator)
    return lines