Example #1
def annotate_indel_on_db(row, fasta, dbsnp, clnvr, chr_prefixed):
    """Check if there are equivalent indels on dbSNP and ClinVar
    for each indel. If exists, annotate with SNP info.

    Args:
        row (pandas.Series): with 'chr', 'pos', 'is_ins', 'indel_seq' labels
        fasta (str): path to .fa
        dbsnp (pysam.TabixFile): tabix handle for 00-All.151.indel.vcf.gz
        clnvr (pysam.TabixFile): tabix handle for clinvar.indel.vcf.gz
        chr_prefixed (bool): True if chromosome names in BAM are "chr"-prefixed
    Returns:
        report (IndelSnpFeatures): idl object reporting SNP info
    """
    chr = row["chr"]
    pos = row["pos"]
    idl_type = row["is_ins"]
    idl_seq = row["indel_seq"]

    # obj representing the indel in reference genome
    idl = curate_indel_in_genome(fasta, chr, pos, idl_type, idl_seq,
                                 chr_prefixed)
    # obj representing report of the indel
    report = IndelSnpFeatures(chr, pos, idl_type, idl_seq)

    # search for equivalent indels over pos +/- search_window nt
    search_window = 50
    start, end = pos - search_window, pos + search_window
    chr_vcf = row["chr"].replace("chr", "")

    for record in dbsnp.fetch(chr_vcf, start, end, parser=pysam.asTuple()):
        bambinos = vcf2bambino(record)
        for bb in bambinos:
            if idl_type == bb.idl_type and len(idl_seq) == len(bb.idl_seq):
                # indel on db representing in reference genome
                db_idl = curate_indel_in_genome(fasta, chr, bb.pos,
                                                bb.idl_type, bb.idl_seq,
                                                chr_prefixed)
                if idl == db_idl:
                    rs = record[2]
                    report.add_dbsnp_id(rs)
                    report.add_dbsnp_freq(dbsnp_freq(record))
                    # report.add_dbsnp_origin(dbsnp_origin(record))
                    report.add_dbsnp_common(dbsnp_common(record))

    for record in clnvr.fetch(chr_vcf, start, end, parser=pysam.asTuple()):
        bambinos = vcf2bambino(record)
        for bb in bambinos:
            if idl_type == bb.idl_type and len(idl_seq) == len(bb.idl_seq):
                db_idl = curate_indel_in_genome(fasta, chr, bb.pos,
                                                bb.idl_type, bb.idl_seq,
                                                chr_prefixed)
                if idl == db_idl:
                    id = record[2]
                    report.add_clnvr_id(id)
                    report.add_clnvr_freq(clnvr_freq(record))
                    # report.add_clnvr_origin(clnvr_origin(record))
                    report.add_clnvr_info(cln_info(record))

    return report
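
A minimal usage sketch for the function above; the DataFrame columns, the file paths, and the fact that dbsnp/clnvr are opened as pysam.TabixFile handles (the code calls .fetch() on them) are assumptions for illustration only.

import pandas as pd
import pysam

# Hypothetical inputs for illustration only.
indels = pd.DataFrame(
    {"chr": ["chr1"], "pos": [1014143], "is_ins": [0], "indel_seq": ["G"]}
)
fasta = "/path/to/genome.fa"                                  # assumed path
dbsnp = pysam.TabixFile("/path/to/00-All.151.indel.vcf.gz")   # assumed path
clnvr = pysam.TabixFile("/path/to/clinvar.indel.vcf.gz")      # assumed path

# One IndelSnpFeatures report per row.
reports = indels.apply(
    annotate_indel_on_db,
    axis=1,
    fasta=fasta, dbsnp=dbsnp, clnvr=clnvr, chr_prefixed=True,
)
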
Example #2
 def test_tabix_multi_ps_open(self):
     with open(self.tabix_ref,"rb") as fh1:
         with open(self.tabix_ref,"rb") as fh2:
             ps1 = pysam.tabix_file_iterator(fh1,pysam.asTuple())
             ps2 = pysam.tabix_file_iterator(fh2,pysam.asTuple())
             reader = MockReader(ps1,ps2,self.tabix_ref,tabix=True)
             for expected, found in zip(reader,self.bed_lines+self.bed_lines):
                 self.assertEqual(expected.strip("\n"),found.strip("\n"))
Example #3
 def test_tabix_multi_ps_open(self):
     with open(self.tabix_ref, "rb") as fh1:
         with open(self.tabix_ref, "rb") as fh2:
             ps1 = pysam.tabix_file_iterator(fh1, pysam.asTuple())
             ps2 = pysam.tabix_file_iterator(fh2, pysam.asTuple())
             reader = MockReader(ps1, ps2, self.tabix_ref, tabix=True)
             for expected, found in zip(reader,
                                        self.bed_lines + self.bed_lines):
                 self.assertEqual(expected.strip("\n"), found.strip("\n"))
Example #4
 def get_coverage(self, chrom, start, stop):
     if not self._binned:
         for row in self._tabixfile.fetch(chrom, start, stop+1, parser=pysam.asTuple()):
             yield json.loads(row[2])
     else:
         # Right now we don't include the region_end column in our coverage files,
         # so there's no way to make sure we get the bin overlapping the start of our query region.
         # To deal with it for now, we'll just use start-50
         # TODO: include region_end in coverage files.
         for row in self._tabixfile.fetch(chrom, max(1, start-50), stop+1, parser=pysam.asTuple()):
             d = json.loads(row[2])
             if d['end'] < start or d['start'] > stop: continue
             d['start'] = max(d['start'], start)
             d['end'] = min(d['end'], stop)
             yield d
Example #5
 def get(self, chrom, position, ref, alt):
     if self.has_chr_prefix and not chrom.startswith('chr'):
         chrom = 'chr' + chrom
     elif not self.has_chr_prefix and chrom.startswith('chr'):
         chrom = chrom[3:]
     if not self.overlaps(chrom, position):
         self.chrom = chrom
         self.start = position
         self.stop = position + self.step_bp
         self.data = dict()
         for f in self.files:
             with pysam.Tabixfile(f, 'r') as tabix:
                 for row in tabix.fetch(self.chrom,
                                        self.start - 1,
                                        self.stop + 1,
                                        parser=pysam.asTuple()):
                     name = ':'.join(row[:4])
                     cadd_raw, cadd_phred = map(float, row[4:6])
                     if name in self.data:
                         if self.data[name][1] < cadd_phred:
                             self.data[name] = (cadd_raw, cadd_phred)
                     else:
                         self.data[name] = (cadd_raw, cadd_phred)
     return self.data.get(':'.join((chrom, str(position), ref, alt)),
                          (None, None))
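
The method above caches one window of CADD scores so neighbouring lookups reuse a single tabix fetch. Below is a stripped-down, self-contained sketch of the same windowed-cache idea, assuming a tabix-indexed score file laid out as chrom/pos/ref/alt/raw/phred; the path and column layout are assumptions.

import pysam

class WindowedScores:
    """Cache one window of scores so nearby lookups reuse a single fetch."""

    def __init__(self, path, step_bp=100000):
        self.path = path
        self.step_bp = step_bp
        self.chrom, self.start, self.stop = None, 0, 0
        self.data = {}

    def get(self, chrom, pos, ref, alt):
        if chrom != self.chrom or not (self.start <= pos < self.stop):
            self.chrom, self.start, self.stop = chrom, pos, pos + self.step_bp
            self.data = {}
            with pysam.TabixFile(self.path) as tabix:
                for row in tabix.fetch(chrom, self.start - 1, self.stop,
                                       parser=pysam.asTuple()):
                    # assumed columns: chrom, pos, ref, alt, raw, phred
                    self.data[":".join(row[:4])] = float(row[5])
        return self.data.get(":".join((chrom, str(pos), ref, alt)))
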
Example #6
    def _get_schema(self):
        if self._dataset is None:
            self._open_dataset()
        self._chroms = list(self._dataset.contigs)

        rec = next(self._dataset.fetch(self._chroms[0], parser=asTuple()))
        num_fields = len(rec)

        chrom_coord_dtype = np.int64
        dtypes = {
            "chrom": pd.CategorialDtype(self._chroms + ["NULL"], ordered=True),
            "start": chrom_coord_dtype,
            "end": chrom_coord_dtype,
            "name": str,
            "score": np.float32,
            "strand": bool,
        }
        self._dtype = {
            key: dtypes[key]
            for key in list(dtypes.keys())[:num_fields]
        }
        return Schema(
            datashape=None,
            dtype=self._dtype,
            shape=(None, len(self._dtype)),
            npartitions=len(self._chroms),
            extra_metadata={},
        )
Example #7
def getphastscores(phastconsbed, gcoords):
    scores = []  #all scores
    scoresd = {}  #{geneid : [scores]}
    tbx = pysam.Tabixfile(phastconsbed)
    nt = 0
    ntwithscores = 0

    for gene in gcoords:
        pcscores = []  #list of all pc scores for coords in this gene
        chrm = gcoords[gene][0]
        coords = gcoords[gene][2]
        for coord in coords:
            nt += 1
            for row in tbx.fetch(chrm,
                                 coord,
                                 coord + 1,
                                 parser=pysam.asTuple()):
                score = row[4]
                if score:
                    ntwithscores += 1
                    scores.append(score)
                    pcscores.append(score)
        if pcscores:
            scoresd[gene] = pcscores

    print 'Interrogated {0} nucleotides. Found phastcons scores for {1} ({2}%) of them.'.format(
        nt, ntwithscores,
        round((ntwithscores / float(nt)), 4) * 100)

    return scores, scoresd
Example #8
def parse_annotations(chrom, pos):

    AF_supAFR = CSQ = 'NA'

    if chrom == 'X':
        if pos <= 2699520:
            replace = 'X_PAR1'
        elif pos >= 154931044:
            replace = 'X_PAR2'
        else:
            replace = 'X_nonPAR'
    else:
        replace = chrom
    path_vcf = '../../../SHAPEIT/out_annotate_2016Dec28/{}.minAC1.no_mask.without_related.vcf.gz'.format(replace)
    tbx = pysam.TabixFile(path_vcf)
    for row in tbx.fetch(chrom, pos - 1, pos, parser=pysam.asTuple()):
        for _ in row[7].split(';'):
            if _ == 'DB':
                continue
            k, v = _.split('=')
            if k == 'AF_supAFR':
                AF_supAFR = v
            elif k == 'CSQ':
                CSQ = v

    return AF_supAFR, CSQ
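
The INFO handling above special-cases the DB flag; a slightly more general sketch that tolerates any flag entry (one without an "=") might look like this.

def parse_info(info_field):
    """Split a VCF INFO string into a dict; flag entries map to True."""
    out = {}
    for entry in info_field.split(';'):
        if '=' in entry:
            key, value = entry.split('=', 1)
            out[key] = value
        else:
            out[entry] = True  # flag field such as DB
    return out

# e.g. parse_info(row[7]).get('AF_supAFR', 'NA')
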
Example #9
    def testIteratorUncompressed(self):
        '''test iteration from uncompressed file.'''
        tmpfilename = 'tmp_testIteratorUncompressed'
        infile = gzip.open(self.filename, "rb")
        outfile = open(tmpfilename, "wb")
        outfile.write(infile.read())
        outfile.close()
        infile.close()

        with open(tmpfilename) as infile:
            for x, r in enumerate(pysam.tabix_iterator(infile,
                                                       pysam.asTuple())):
                self.assertEqual(self.compare[x], list(r))
                self.assertEqual(len(self.compare[x]), len(r))

                # test indexing
                for c in range(0, len(r)):
                    self.assertEqual(self.compare[x][c], r[c])

                # test slicing access
                for c in range(0, len(r) - 1):
                    for cc in range(c + 1, len(r)):
                        self.assertEqual(self.compare[x][c:cc], r[c:cc])

        os.unlink(tmpfilename)
Example #10
    def annotate(self, bedline, genome):
        c = bedline.rstrip().rsplit("\t")
        chr = c[0]
        start = c[1]
        end = c[2]

        if not re.search('chr', chr):
            raise LookupError("chromosome names must start with chr: " + chr)
            return []
        if (self.genome != genome):
            raise LookupError(
                "tried to compare a %s bedfile to a %s annotation." %
                (genome, self.genome))
            return []
        else:
            annotations = []
            if (chr and start and end):
                if self.tabixContigs.has_key(chr):
                    tabixTupleParse = self.tabix.fetch(reference=chr,
                                                       start=int(start),
                                                       end=int(end),
                                                       parser=pysam.asTuple())
                    for tabixTuple in tabixTupleParse:
                        annotations.append(tabixTuple[3])
                    return uniqann(annotations)
                else:
                    return []
            else:
                raise LookupError(
                    "can't find chr,start,end. File must be tab-delimited")
                return []
Example #11
def load_segmented_data(filepath, interval):
	
	res = genomic_interval_set()

	tabix = pysam.TabixFile(filepath)
	for row in tabix.fetch(interval.chrom, interval.start, interval.end, parser = pysam.asTuple()):
		
		chrom=row[0]
		start = int(row[1])
		end = int(row[2])
		
		try:
			name=row[3]
		except:
			name='.'

		try:
			score=float(row[4])
		except:
			score=-np.inf
	
		try:
			strand=row[5]
		except:
			strand='+'

		res += genomic_interval(chrom, start, end, name=name, score=score, strand=strand) 
	
	tabix.close()

	return res
Example #12
def snp_cal(chromo, window_start, window_end):

    rows = tuple(
        parsevcf.fetch(region="%s:%s-%s" % (chromo, window_start, window_end),
                       parser=pysam.asTuple()))

    sites_total, sites_unmasked, sites_passing, sites_variant = 0, 0, 0, 0
    calls = [0] * len(samples)
    hets = [0] * len(samples)

    for line in rows:
        sites_total += 1
        if "CpGRep" in line[6]: continue
        sites_unmasked += 1
        if "FAIL" in line[6]: continue
        if "WARN" in line[6]: continue
        sites_passing += 1
        if line[4] != '.': sites_variant += 1
        for i in range(0, len(samples)):
            GT = line[i + 9]
            if GT[:1] != '.': calls[i] += 1
            if GT[:3] == '0/1': hets[i] += 1
            if GT[:3] == '0|1': hets[i] += 1

    output.write(
        '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %
        (chromo, window_start, sites_total, sites_unmasked, sites_passing,
         sites_variant, '\t'.join(map(str, calls)), '\t'.join(map(str, hets))))
Example #13
def annotate_variants_list(args, select_cursor, update_cursor):
    """
    Populate a new, user-defined column in the variants
    table with an INTEGER indicating the count of overlaps
    between the variant and the annotation file.
    """
    add_requested_column(args.col_name, update_cursor)

    # For each variant, use Tabix to count overlaps with the user-defined
    # annotation file.  Update the variant row with the count.
    annos = pysam.Tabixfile(args.anno_file)
    select_cursor.execute("SELECT chrom, start, end, variant_id FROM variants")
    for row in select_cursor:
        hit_list = []
        for hit in annos.fetch(str(row['chrom']), int(row['start']), int(row['end']),
                               parser=pysam.asTuple()):
            try:
                hit_list.append(hit[int(args.col_extract) - 1])
            except IndexError:
                sys.exit("Column " + args.col_extract + " exceeds \
                          the number of columns in your \
                          annotation file. Exiting.")
                          
        hits = ",".join(hit_list)
        if len(hit_list):
            update_qry = "UPDATE variants SET " + args.col_name + " = '" + hits + \
                        "' WHERE variant_id = " + str(row['variant_id'])
        else:
            update_qry = "UPDATE variants SET " + args.col_name + " = NULL" + \
                        " WHERE variant_id = " + str(row['variant_id'])
        update_cursor.execute(update_qry)
Example #14
    def load_counts(self, discfile, window_in, window_out):
        reg = '{0}:{1}-{2}'

        def _get_coords(pos, strand):
            if strand == '+':
                start, end = pos - window_out, pos + window_in
            else:
                start, end = pos - window_in, pos + window_out
            return start, end

        startA, endA = _get_coords(self.posA, self.strandA)
        startB, endB = _get_coords(self.posB, self.strandB)

        region = reg.format(self.chrA, startA, endA)

        counts = defaultdict(int)
        pairs = discfile.fetch(region=region, parser=pysam.asTuple())

        for pair in pairs:
            pair = _DiscPair(*pair)

            # Pairs were selected based on window around chrA; check chrB
            if pair.chrB != self.chrB:
                continue
            if not (startB <= pair.posB < endB):
                continue

            # Require pairs match breakpoint strand
            if pair.strandA != self.strandA or pair.strandB != self.strandB:
                continue

            counts[pair.sample] += 1

        self.pair_counts = pd.DataFrame.from_dict({'count': counts})
Example #15
    def testTabixIndexedTsvCreation(self):
        inFile = "testdata/ESP6500SI-V2.chr1.snps_indels.head.25.txt"
        destDir = "out"

        # chr, startPos, endPos
        resultIndexedFile = TabixIndexer.index(destDir=destDir, inputFilename=inFile, fileColumnNumList=[0, 1, 1])
        self.assertTrue(os.path.exists(resultIndexedFile), "No index file was generated.")

        chrom = "1"
        start = "69594"
        end = "69594"
        tsvRecords = None
        tsvReader = pysam.Tabixfile(filename=resultIndexedFile)  # initialize the tsv reader
        try:
            tsvRecords = tsvReader.fetch(chrom, int(start)-1, int(end), parser=pysam.asTuple())
        except ValueError:
            pass

        tsvRecord = None
        for tsvRecord in tsvRecords:
            self.assertEqual(tsvRecord[5], "2,6190", "Value in column sixth does not match the expected value.")

        self.assertIsNotNone(tsvRecord, "No record for %s:%s-%s was found." % (chrom, start, end))

        os.remove(resultIndexedFile)
Example #16
    def __iter__(self):
        from pysam import Tabixfile, asTuple
        f = Tabixfile(self.filename, mode='r')
        try:
            # header row
            if self.header is not None:
                yield self.header
            else:
                # assume last header line has fields
                h = list(f.header)
                if len(h) > 0:
                    header_line = text_type(h[-1], encoding='ascii')
                    yield tuple(header_line.split('\t'))

            # data rows
            for row in f.fetch(reference=self.reference,
                               start=self.start,
                               end=self.stop,
                               region=self.region,
                               parser=asTuple()):
                yield tuple(row)

        except:
            raise
        finally:
            f.close()
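
A stand-alone sketch of the same header-plus-rows pattern using pysam directly; the file path and region are assumptions.

import pysam

with pysam.TabixFile("/path/to/data.bed.gz") as f:    # assumed path
    header_lines = list(f.header)                      # raw header lines, if any
    for row in f.fetch("chr1", 0, 100000, parser=pysam.asTuple()):
        fields = tuple(row)                            # indexable, sliceable row
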
Example #17
def get_snp_data(*args, **kwargs):
    '''
    proxy for TabixFile.fetch
    '''
    kwargs['multiple_iterators'] = True
    return TabixFile(SNP_FILE, parser=asTuple()).\
            fetch(*args, **kwargs)
Example #18
    def testIteratorUncompressed(self):
        '''test iteration from uncompressed file.'''
        tmpfilename = 'tmp_testIteratorUncompressed'
        infile = gzip.open(self.filename, "rb")
        outfile = open(tmpfilename, "wb")
        outfile.write(infile.read())
        outfile.close()
        infile.close()

        with open(tmpfilename) as infile:
            for x, r in enumerate(pysam.tabix_iterator(
                    infile, pysam.asTuple())):
                self.assertEqual(self.compare[x], list(r))
                self.assertEqual(len(self.compare[x]), len(r))

                # test indexing
                for c in range(0, len(r)):
                    self.assertEqual(self.compare[x][c], r[c])

                # test slicing access
                for c in range(0, len(r) - 1):
                    for cc in range(c + 1, len(r)):
                        self.assertEqual(self.compare[x][c:cc],
                                         r[c:cc])

        os.unlink(tmpfilename)
Example #19
def vci_query(vci_file, region, fasta_file):
    # ./bin/g2gtools vciquery -v data/mm/REF2CAST.vci.gz -r "1:13009000-13009800" -d
    start = time.time()

    vci_file = g2g_utils.check_file(vci_file, 'r')

    LOG.info("VCI File: {}".format(vci_file))
    LOG.info("Region: {}".format(region))

    vci_f = VCIFile(vci_file, seq_ids=[region.seq_id])
    vci_f.parse(False)

    mappings = vci_f.find_mappings(region.seq_id, region.start, region.end)

    for m in mappings:
        LOG.debug(m)

    start_pos = mappings[0].to_start
    end_pos = mappings[-1].to_end

    LOG.debug("Converted Region: {}:{}-{}".format(region.seq_id, start_pos + 1,
                                                  end_pos + 1))

    for line in vci_f.fetch(reference=region.seq_id,
                            start=start_pos,
                            end=end_pos,
                            parser=pysam.asTuple()):
        print(str(line))

    LOG.info("VCI parsed: {0}".format(g2g_utils.format_time(
        start, time.time())))
Example #20
    def fetch_highest_scores(self, chrom, pos_begin, pos_end):
        result = dict()

        stripped_chrom = handle_chrom_prefix(self.chr_prefix, chrom)

        try:
            for line in self.accessor.direct_infile.fetch(
                    stripped_chrom, pos_begin - 1, pos_end,
                    parser=pysam.asTuple()):
                line = LineAdapter(self.accessor.score_file, line)
                for column in self.score_names:
                    score_index = self.schema.col_names.index(column)
                    score_value = float(line[score_index]) \
                        if str.lower(line[score_index]) \
                        != self.config.general.no_score_value else np.nan
                    result[column] = max(score_value,
                                         result.get(column, np.nan))
            return result

        except ValueError as ex:
            print(
                f"could not find region {chrom}:{pos_begin}-{pos_end} "
                f"in {self.score_filename}: ",
                ex,
                file=sys.stderr,
            )
            return result
Example #21
def getMinMaxPositions(depthFile, contig):
    with closing(pysam.TabixFile(depthFile)) as tabix:
        first_entry = None
        for first_entry in tabix.fetch(contig, 0, parser=pysam.asTuple()):
            break
        last_Mbp = 0
        while any(True for _ in tabix.fetch(
                contig, last_Mbp, parser=pysam.asTuple())):
            last_Mbp += 5000000
        last_entry = None
        for last_entry in tabix.fetch(contig,
                                      last_Mbp - 5000000,
                                      parser=pysam.asTuple()):
            pass
        return (long(first_entry[1]) if first_entry is not None else None,
                long(last_entry[1]) if last_entry is not None else None)
Example #22
def bed_regions(bed_file, chromo):
    open_bed = pysam.TabixFile(bed_file)
    for line in open_bed.fetch(chromo, parser=pysam.asTuple()):
        start = int(line[1])
        end = int(line[2])

        yield start, end
Example #23
    def _create_column_dict_from_tabix_index(self, mutation):
        mut_start = int(mutation.start)
        mut_end = int(mutation.end)
        chrom = mutation.chr
        vals = {}
        try:
            # tabix needs position - 1
            tsv_records = self.tsv_reader.fetch(chrom,
                                                mut_start - 1,
                                                mut_end,
                                                parser=pysam.asTuple())
            i = -1
            for i, tsv_record in enumerate(tsv_records):
                if not tsv_record:  # skip in case no records are found
                    continue

                logging.getLogger(__name__).debug("Got a record.")
                # Determine whether the new tsv record matches mutation or not
                if self._is_matching(mutation, tsv_record):
                    for colName in self.output_tsv_headers:
                        if colName.strip() == "":
                            continue
                        val = tsv_record[self.tsv_headers[colName]]
                        if colName not in vals:
                            vals[colName] = [val]
                        else:
                            vals[colName] += [val]
            logging.getLogger(__name__).debug("Processed %d records." %
                                              (i + 1))
        except ValueError as ve:
            msg = "Exception when looking for tsv records. Empty set of records being returned: " + repr(
                ve)
            logging.getLogger(__name__).debug(msg)
        return vals
Example #24
def get_snp_data(*args, **kwargs):
    '''
    proxy for TabixFile.fetch
    '''
    kwargs['multiple_iterators'] = True
    return TabixFile(SNP_FILE, parser=asTuple()).\
            fetch(*args, **kwargs)
Example #25
def get_info_from_variants(chrom, start, stop, field):
    """
    Extract the given FORMAT field for variants in a search region of the VCF file
    Arguments:
        chrom (str): chromosome of search region
        start (str): start position of search region
        stop (str): stop position of search region
        field (str): field to extract from VCF file
    Returns:
        list of tuple: (chromosome, field value) pairs
    """
    tuples = []
    format_index = get_format_index()
    field_index = get_format_field_index(field)
    try:
        for entry in tbx.fetch(chrom, int(start), int(stop), parser=asTuple()):

            if float(entry[5]) > 20:
                tuples.append(
                    (chrom, get_field_value(entry, format_index, field_index)))
    except ValueError:
        print("No variants found in region {}:{}:{}".format(
            chrom, start, stop))
        pass

    return tuples
Example #26
    def annotate(self,bedline,genome):
        c = bedline.rstrip().rsplit("\t")
        chr   = c[0]
        start = c[1]
        end   = c[2]

        if not re.search('chr',chr):
            raise LookupError("chromosome names must start with chr: " + chr)
            return []
        if (self.genome != genome):
            raise LookupError("tried to compare a %s bedfile to a %s annotation." % (genome,self.genome))
            return []
        else:
            annotations = []
            if (chr and start and end):
                if self.tabixContigs.has_key(chr):
                    tabixTupleParse = self.tabix.fetch(reference=chr,
                                                       start=int(start),
                                                       end=int(end),
                                                       parser=pysam.asTuple())
                    for tabixTuple in tabixTupleParse:
                        annotations.append(tabixTuple[3])
                    return uniqann(annotations)
                else:
                    return []
            else:
                raise LookupError("can't find chr,start,end. File must be tab-delimited")
                return []
Example #27
def annotate_indel_on_db(idl, idl_report, db, genome, chr_prefixed,
                         vcf_chr_prefixed, preset):

    chr, pos, idl_type, idl_seq = idl.chr, idl.pos, idl.idl_type, idl.idl_seq

    # search for equivalent indels over pos +/- search_window nt
    search_window = 50
    start, end = pos - search_window, pos + search_window

    chr_vcf = chr.replace("chr", "")
    chr_vcf = "chr" + chr_vcf if vcf_chr_prefixed else chr_vcf

    for record in db.fetch(chr_vcf, start, end, parser=pysam.asTuple()):
        bambinos = vcf2bambino(record)
        for bb in bambinos:
            if idl_type == bb.idl_type and len(idl_seq) == len(bb.idl_seq):
                # indel on db representing in reference genome
                db_idl = curate_indel_in_genome(genome, chr, bb.pos,
                                                bb.idl_type, bb.idl_seq,
                                                chr_prefixed)
                if idl == db_idl:
                    if preset == "dbsnp":
                        idl_report.add_dbsnp_id(record[2])
                        idl_report.add_dbsnp_freq(dbsnp_freq(record))
                        # idl_report.add_dbsnp_origin(dbsnp_origin(record))
                        idl_report.add_dbsnp_common(dbsnp_common(record))
                    elif preset == "clinvar":
                        idl_report.add_clnvr_id(record[2])
                        idl_report.add_clnvr_freq(clnvr_freq(record))
                        # idl_report.add_clnvr_origin(clnvr_origin(record))
                        idl_report.add_clnvr_info(cln_info(record))
                    else:
                        idl_report.add_germline_id(record[2])

    return idl_report
Example #28
    def testCopy(self):
        a = self.tabix.fetch(parser=pysam.asTuple()).next()
        b = copy.copy(a)
        self.assertEqual(a, b)

        a = self.tabix.fetch(parser=pysam.asGTF()).next()
        b = copy.copy(a)
        self.assertEqual(a, b)
Example #29
def readDepthChunk(depthFile, contig, start, end):
    chunk = dict()
    if start > 0:
        start -= 1
    with closing(pysam.Tabixfile(depthFile)) as tabix:
        for row in tabix.fetch(contig, start, end, parser=pysam.asTuple()):
            chunk[long(row[1])] = int(row[3])
    return chunk
Example #30
    def testCopy(self):
        a = self.tabix.fetch(parser=pysam.asTuple()).next()
        b = copy.copy(a)
        self.assertEqual(a, b)

        a = self.tabix.fetch(parser=pysam.asGTF()).next()
        b = copy.copy(a)
        self.assertEqual(a, b)
Example #31
    def testTuple( self ):

        for x, r in enumerate(self.tabix.fetch( parser = pysam.asTuple() )):
            self.assertEqual( self.compare[x], list(r) )

            self.assertEqual( len(self.compare[x]), len(r) )
            for c in range(0,len(r)):
                self.assertEqual( self.compare[x][c], r[c] )
Example #32
    def aggregate(self, chrom):
        import pysam
        filepath = self.filepath
        binsize = self.gs.binsize
        idmap = self.gs.idmap
        chromsizes = self.gs.chromsizes
        chrom_binoffset = self.gs.chrom_binoffset
        chrom_abspos = self.gs.chrom_abspos
        start_abspos = self.gs.start_abspos
        C2, P2 = self.C2, self.P2

        these_bins = self.gs.fetch(chrom)
        rows = []
        with pysam.TabixFile(filepath, 'r', encoding='ascii') as f:
            parser = pysam.asTuple()
            accumulator = Counter()

            for bin1_id, bin1 in these_bins.iterrows():
                for line in f.fetch(chrom, bin1.start, bin1.end,
                                    parser=parser):
                    chrom2 = line[C2]
                    pos2 = int(line[P2])

                    try:
                        cid2 = idmap[chrom2]
                    except KeyError:
                        # this chrom2 is not requested
                        continue

                    if binsize is None:
                        lo = chrom_binoffset[cid2]
                        hi = chrom_binoffset[cid2 + 1]
                        bin2_id = lo + np.searchsorted(
                            start_abspos[lo:hi],
                            chrom_abspos[cid2] + pos2,
                            side='right') - 1
                    else:
                        bin2_id = chrom_binoffset[cid2] + (pos2 // binsize)

                    accumulator[bin2_id] += 1

                if not accumulator:
                    continue

                rows.append(
                    pandas.DataFrame(
                        {
                            'bin1_id': bin1_id,
                            'bin2_id': list(accumulator.keys()),
                            'count': list(accumulator.values())
                        },
                        columns=['bin1_id', 'bin2_id',
                                 'count']).sort_values('bin2_id'))

                accumulator.clear()

        logger.info(chrom)
        return pandas.concat(rows, axis=0) if len(rows) else None
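
When binsize is None above, a position is assigned to a variable-width bin with searchsorted over absolute bin start coordinates; here is a tiny worked sketch of that lookup (the coordinates are made up).

import numpy as np

# Absolute start coordinates of the bins of one chromosome (assumed values).
start_abspos = np.array([0, 1000, 5000, 9000])
pos_abs = 5200  # absolute coordinate of the record

# Right-most bin whose start is <= pos_abs.
bin2_id = np.searchsorted(start_abspos, pos_abs, side='right') - 1
assert bin2_id == 2
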
Example #33
def bed_regions(bed_file, chromo):
    open_bed = pysam.TabixFile(bed_file)
    coords = []
    for line in open_bed.fetch(chromo, parser=pysam.asTuple()):
        start = int(line[1])
        end = int(line[2])
        coords += range(start, end)

    return coords
Example #34
def by_region(region, version, species, limit=None):
    """Perform the search by region.

    Args:
        region (str): The region to look for SNPs.
        version (int): The Ensembl version number.
        species (str): The Ensembl species identifier.
        limit (int, optional): Maximum number of SNPs to return, ``None`` for
            all.

    Returns:
        list: All the SNPs in `region`.  Each element is another ``list`` with
        the following values:
            * chromosome
            * position
            * SNP identifier
            * reference allele
            * alternate allele

    Raises:
        ValueError: When `region` is empty.
    """
    LOG = utils.get_logger()

    LOG.debug(sqlite3.version_info)
    LOG.debug(sqlite3.version)

    LOG.debug('range={}'.format(region))
    LOG.debug('version={}'.format(version))
    LOG.debug('species_id={}'.format(species))
    LOG.debug('limit={}'.format(limit))

    try:

        if not region:
            raise ValueError('no ids were passed in')

        new_region = fetch_utils.str_to_region(region)

        tabix_file = fetch_utils.get_tabix_file(version, species)
        tbx = pysam.TabixFile(tabix_file)

        start_time = time.time()

        snps = []
        for row in tbx.fetch('{}'.format(new_region.chromosome),
                             new_region.start_position,
                             new_region.end_position,
                             parser=pysam.asTuple()):
            snps.append(list(row[:5]))

        LOG.info('Done: {}'.format(utils.format_time(start_time, time.time())))

        return snps
    except Exception as e:
        LOG.error('Error: {}'.format(e))
        return None
Example #35
 def testUnset(self):
     for x, r in enumerate(self.tabix.fetch(parser=pysam.asTuple())):
         self.assertEqual(self.compare[x], list(r))
         c = list(r)
         e = list(r)
         for y in range(len(r)):
             r[y] = c[y] = None
             e[y] = ""
             self.assertEqual(c, list(r))
             self.assertEqual("\t".join(e), str(r))
Example #36
 def match(self, chrm, position, indels=True):
     ''' match single position in CADD file; default indels to true
     b/c will be using this more with the indel file'''
     tbxFh = self._indel if indels else self._snv
     caddChr = 'MT' if chrm == 'M' else xstr(chrm)
     try:
         return tbxFh.fetch(caddChr, int(position) - 1, int(position), parser=pysam.asTuple())
     except ValueError as e: # happens sometimes on chrm M/MT
         warning("WARNING", e)
         return []
Example #37
 def _region_reset(self, region):
     region = handle_chrom_prefix(self._has_chrom_prefix, region)
     try:
         self.lines_iterator = self.infile.fetch(region=region,
                                                 parser=pysam.asTuple())
     except ValueError as ex:
         print(f"could not find region {region} in {self.filename}:",
               ex,
               file=sys.stderr)
         self.lines_iterator = None
Example #38
 def testUnset( self ):
     for x, r in enumerate(self.tabix.fetch( parser = pysam.asTuple() )):
         self.assertEqual( self.compare[x], list(r) )
         c = list(r)
         e = list(r)
         for y in range(len(r)):
             r[y] = c[y] = None
             e[y] = ""
             self.assertEqual( c, list(r) )
             self.assertEqual( "\t".join(e), str(r) )
Example #39
def main():
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('vcf')
    parser.add_argument('famfile', type=argparse.FileType('r'))
    parser.add_argument('-c', '--countfile', required=True)
    parser.add_argument('-d', '--discfile', required=True)
    parser.add_argument('--discfile-index')
    parser.add_argument('--countfile-index')
    parser.add_argument('--background', type=int, default=160)
    parser.add_argument('--max-parents', type=float, default=10)
    parser.add_argument('petest', type=argparse.FileType('w'), help='fout')
    parser.add_argument('srtest', type=argparse.FileType('w'), help='fout')
    args = parser.parse_args()

    vcf = pysam.VariantFile(args.vcf)
    fam = parse_famfile(args.famfile)

    if args.discfile_index is None:
        discfile = pysam.TabixFile(args.discfile, parser=pysam.asTuple())
    else:
        discfile = pysam.TabixFile(args.discfile,
                                   index=args.discfile_index,
                                   parser=pysam.asTuple())

    if args.countfile_index is None:
        countfile = pysam.TabixFile(args.countfile, parser=pysam.asTuple())
    else:
        countfile = pysam.TabixFile(args.countfile,
                                    index=args.countfile_index,
                                    parser=pysam.asTuple())

    header = 'name sample log_pval called_median bg_median'.split()
    args.petest.write('\t'.join(header) + '\n')

    header = 'name sample coord pos log_pval called_median bg_median'.split()
    args.srtest.write('\t'.join(header) + '\n')

    runner = DenovoTestRunner(vcf, fam, countfile, discfile, args.petest,
                              args.srtest, args.background, args.max_parents)
    runner.run()
Example #40
    def get_gapped_wnds(self,chr,tbx_gaps):
        
        gapped_wnds = []

        for t in tbx_gaps.fetch(chr,parser=pysam.asTuple()):
            _chr,start,end = t
            wnd_start = np.searchsorted(self.starts,start) 
            wnd_end = np.searchsorted(self.starts,end) 
            gapped_wnds.append(tuple([wnd_start,wnd_end]))
        
        return gapped_wnds 
Example #41
    def get_gapped_wnds(self, chr, tbx_gaps):

        gapped_wnds = []

        for t in tbx_gaps.fetch(chr, parser=pysam.asTuple()):
            _chr, start, end = t
            wnd_start = np.searchsorted(self.starts, start)
            wnd_end = np.searchsorted(self.starts, end)
            gapped_wnds.append(tuple([wnd_start, wnd_end]))

        return gapped_wnds
Example #42
def get_ref_alt_from_dbSNP(chrom, pos, path_vcf):

    tbx = pysam.TabixFile(path_vcf)
    for row in tbx.fetch(chrom, pos - 1, pos, parser=pysam.asTuple()):
        if len(row[3]) == 1 and 1 in map(len, row[4].split(',')):
            break
    else:
        # fail loudly when the position is absent from dbSNP
        raise NameError('position not found in dbSNP')

    assert ',' not in row[4], row

    return row[3], row[4]
Example #43
 def __init__(self, task_queue, results_queue, family, args):
     multiprocessing.Process.__init__(self)
     self.task_queue = task_queue
     self.family = family
     self.results_queue = results_queue
     self.verbosity = args.verbose
     self.phased = args.phased
     self.cadd_file = args.cadd_file[0]
     self.chr_prefix = args.chr_prefix
                 
     if self.cadd_file:
         self.cadd_file = Tabixfile(self.cadd_file, parser = asTuple())
Example #44
 def get_overlapping_wnds(self,chr,tbx):
     wnd_starts, wnd_ends = self.get_wnds_by_chr(chr)
     bnds = np.array([ [int(l[1]),int(l[2])] 
                         for l in tbx.fetch(chr,parser=pysam.asTuple()) ])
      
     start_idxs = np.searchsorted(wnd_starts,bnds[:,0])
     end_idxs = np.searchsorted(wnd_starts,bnds[:,1])
     #print start_idxs
     #print end_idxs
     ret = np.c_[start_idxs,end_idxs]
     
     return ret
Example #45
def parse_dbSNP(args, chrom, pos, ref, alt):

    ## todo: check this function... I might miss variants in dbSNP...

    tbx = pysam.TabixFile(args.dbSNP)
    row = None
    for row in tbx.fetch(chrom, pos - 1 - 1, pos, parser=pysam.asTuple()):
        print(row, file=sys.stderr)
        if any([
            ## SNP.
            row[3] == ref and alt in row[4].split(','),
            ## SNP in dbSNP and MNP in preselected.txt
            all([
                row[3] == ref[0],
                len(set(alt[0].split(',')) & set(row[4].split(','))) > 0,
                len(ref) in map(len, alt.split(',')),
                len(row[3]) == 1,
                ]),
            ## Insertion.
            all([
                int(row[1]) == pos,
                len(row[3]) == 1,
                ref == '-',
                ## One or more ALTs overlap (e.g. rs3835252).
                len(set(x[1:] for x in row[4].split(',')) & set(alt.split(','))) >= 1,
                ]),
            ## Deletion.
            all([
                int(row[1]) == pos,
                len(row[4]) == 1,
                alt == '-',
                row[3][:1] == ref,
                len(row[3]) > 1,
                len(row[4]) == 1,
                ]),
            ## Deletion.
            all([
                int(row[1]) + 1 == pos,
                len(row[3]) == len(ref) + 1,
                set(map(len, row[4].split(','))) == set([1]),
                alt == '-',
                row[3][1:] == ref,
                ]),
            ]):
            rsID = row[2]
            break
    ## Not found in dbSNP.
    else:
        rsID = ''

    return rsID
Example #46
    def testRead(self):

        for x, r in enumerate(self.tabix.fetch(parser=pysam.asTuple())):
            self.assertEqual(self.compare[x], list(r))
            self.assertEqual(len(self.compare[x]), len(r))

            # test indexing
            for c in range(0, len(r)):
                self.assertEqual(self.compare[x][c], r[c])

            # test slicing access
            for c in range(0, len(r) - 1):
                for cc in range(c + 1, len(r)):
                    self.assertEqual(self.compare[x][c:cc], r[c:cc])
Example #47
 def get_dup_overlap(self, tbx_dups):
     #assuming dup file is collapsed
     min_s = min(self.all_starts)
     max_e = max(self.all_ends)
     t = 0
         
     for l in tbx_dups.fetch(self.chr,min_s,max_e,parser=pysam.asTuple()):
         c,s,e = l
         s,e = int(s), int(e)
         curr_s = s>min_s and s or min_s
         curr_e = e<max_e and e or max_e
         t += curr_e-curr_s
     if t==0: return 0.0
     
     return float(t) / float(max_e - min_s)
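
The overlap fraction above clips each duplication to the [min_s, max_e) span and sums the clipped lengths; a small worked sketch with made-up intervals:

min_s, max_e = 100, 200
dups = [(50, 150), (180, 260)]          # assumed (start, end) rows

t = 0
for s, e in dups:
    t += min(e, max_e) - max(s, min_s)  # clipped overlap with [min_s, max_e)
frac = t / float(max_e - min_s)         # (50 + 20) / 100 = 0.7
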
Example #48
    def testWrite(self):

        for x, r in enumerate(self.tabix.fetch(parser=pysam.asTuple())):
            self.assertEqual(self.compare[x], list(r))
            c = list(r)
            for y in range(len(r)):
                r[y] = "test_%05i" % y
                c[y] = "test_%05i" % y
            self.assertEqual([x for x in c], list(r))
            self.assertEqual("\t".join(c), str(r))
            # check second assignment
            for y in range(len(r)):
                r[y] = "test_%05i" % y
            self.assertEqual([x for x in c], list(r))
            self.assertEqual("\t".join(c), str(r))
Example #49
 def __init__(self, inFile, parser=pysam.asTuple()):
     # inFile is passed in but is never used, and yet the TabixReader works. How?!
     # This inFile magic is because Tabixfile is a Cython object that uses def __cinit__ 
     # rather than def __init__; the former is called automatically exactly once for the 
     # base class prior to any use of __init__. Use of subsequent __init__ should obey 
     # normal inheritance rules (assuming inheritance from object and it's not an old-style class). 
     # So __cinit__ sets the input file, and then our __init__ (which doesn't override a base method) 
     # is called and sets the parser. 
     # This could all break in the future if pysam moves away from __cinit__, but doing so would
     # reduce performance and so it seems unlikely.
     # See:
     #   https://github.com/cython/cython/blob/master/docs/src/userguide/special_methods.rst#id19
     #   https://github.com/pysam-developers/pysam/blob/master/pysam/libctabix.pyx#L331
     #super(TabixReader, self).__init__(inFile, parser=parser)
     self.parser = parser
Example #50
    def testIteratorCompressed(self):
        """test iteration from compressed file."""
        with gzip.open(self.filename) as infile:
            for x, r in enumerate(pysam.tabix_iterator(infile, pysam.asTuple())):
                self.assertEqual(self.compare[x], list(r))
                self.assertEqual(len(self.compare[x]), len(r))

                # test indexing
                for c in range(0, len(r)):
                    self.assertEqual(self.compare[x][c], r[c])

                # test slicing access
                for c in range(0, len(r) - 1):
                    for cc in range(c + 1, len(r)):
                        self.assertEqual(self.compare[x][c:cc], r[c:cc])
Example #51
def plot_GC(chr,tbx_gc,cp_vect,starts,ends):
    F=open("GC.txt",'w')
    cp_vect = cp_vect.astype(np.float64) 
    var = get_windowed_variance(cp_vect,50)
    F.write("var\tgc\n") 
    for i in xrange(50,starts.shape[0],101):
        s = starts[i-50]
        e = ends[i+50]
        gc = np.mean(np.array([float(l[3]) for l in tbx_gc.fetch(chr,s,e,parser=pysam.asTuple())]))
        if var[i] != 0:
            print >>F,"%f\t%f"%( var[i],gc )
        



    exit(1)
Example #52
def load_data(filepath, interval, data_columns=[5], dtype=np.float64):

	""" Loads numeric data columns from a BED-format TABIX file """

	res = np.zeros((len(interval), len(data_columns)), dtype = dtype)

	tabix = pysam.TabixFile(filepath)

	for row in tabix.fetch(interval.chrom, interval.start, interval.end, parser = pysam.asTuple()):
	    i = int(row[1])- interval.start
	    
	    for j, col in enumerate(data_columns):
	    	res[i, j] = dtype(row[col-1])

	tabix.close()
	    
	return res
Example #53
def get_snps(pid):
    '''
    return sequences mentioned in SNPData.csv
    '''
    coords = map(make_coord_string, snps.COORDINATES.values())
    search_args = {
        'coordinate': ','.join(coords),
        'patient': pid,
        '_count': 100000
    }
    seq_bundle = call_api('/Sequence', search_args) 
    seqs = (entry['content'] for entry in seq_bundle['entry'])
    translation_f = TabixFile(SNP_TRANSLATION_FNAME, parser=asTuple()) 
    return jsonify({
        get_rsid(translation_f, seq): seq['observedSeq']
        for seq in seqs
    })
Example #54
    def get_callset(self, exclude_tbxs=[], min_exclude_ratio=0.3, min_exclude_len=20000):
        """
        return segments and their copies in genome 
        coordinates
        adding subtraction of gaps
        """
        c=callset()

        wnd_starts,wnd_ends,cps = self.segment_edges
        wnd_starts,wnd_ends,cps = np.array(wnd_starts), np.array(wnd_ends), np.array(cps)
        
        for i in xrange(len(wnd_starts)-1):
            start, end = self.starts[wnd_starts[i]], self.ends[wnd_ends[i]]
            wnd_start, wnd_end = wnd_starts[i], wnd_ends[i]
            
            #exclude totally anything in these tbxs
            for exclude_tbx in exclude_tbxs:
                ex_starts, ex_ends = [], []
                for l in exclude_tbx.fetch(self.chr,start,end,parser=pysam.asTuple()):
                    _chr,_s,_e = l
                    _s, _e = int(_s), int(_e)
                    if _e-_s > min_exclude_len:
                        ex_starts.append(_s)
                        ex_ends.append(_e)

            n_exclude = len(ex_starts) 
            if n_exclude:
                ex_coords = self.get_exclude_coords(ex_starts, ex_ends)

                wnd_start_ends = self.subtract_excluded(wnd_start, wnd_end, ex_coords)
            else:
                wnd_start_ends = [tuple([wnd_start, wnd_end])]
            
            for i in xrange(len(wnd_start_ends)):
                wnd_start = wnd_start_ends[i][0]
                wnd_end = wnd_start_ends[i][1]

                c.add_call(self.chr,
                           self.starts[wnd_start],
                           self.ends[wnd_end],
                           np.mean(self.cp_data[wnd_start:wnd_end]),
                           wnd_start,
                           wnd_end,
                           self.cp_data[wnd_start:wnd_end])
        return c 
Example #55
    def testRead(self):

        for x, r in enumerate(self.tabix.fetch(parser=pysam.asTuple())):
            c = self.compare[x]
            self.assertEqual(c, list(r))
            self.assertEqual(len(c), len(r))

            # test indexing
            for y in range(0, len(r)):
                self.assertEqual(c[y], r[y])

            # test slicing access
            for y in range(0, len(r) - 1):
                for cc in range(y + 1, len(r)):
                    self.assertEqual(c[y:cc],
                                     r[y:cc])
            self.assertEqual("\t".join(map(str, c)),
                             str(r))
Example #56
def _tabix_iteradaptor(stream):
    """Open `stream` as an iterator over a `tabix`_ file, returning raw strings from tabix data.
    
    Parameters
    ----------
    streams : open file-like, :class:`pysam.ctabix.tabix_file_iterator`
    
    Returns
    -------
    generator
        Generator of tab-delimited string records in `tabix`_ file
    """
    if not isinstance(stream,(pysam.ctabix.tabix_generic_iterator,
                              pysam.ctabix.tabix_file_iterator)
                      ):
        stream = pysam.tabix_file_iterator(stream,pysam.asTuple())
    
    return (str(X) for X in stream)
Example #57
    def process(self):
        if self.genelist:
            gene_list = pysam.TabixFile(self.genelist)

        with open(self.input, 'r') as fin:
            with open(self.output, 'w') as fout:
                for line in fin:

                    line = line.strip()
                    if line.startswith('#chr'):
                        header = line.split('\t')
                        if self.genelist:
                            header += ['PIDD_GENE', 'Inheritance', 'Phenotype']
                        fout.write('\t'.join(header) + '\n')
                        continue
                    elif line.startswith('##'):
                        continue

                    try:
                        row = OrderedDict(zip(header, line.split('\t')))
                    except:
                        continue

                    if self.genelist:
                        PIDD_GENE = []
                        Inheritance = []
                        Phenotype = []
                        try:
                            for genepanel_line in gene_list.fetch(row['#chr'],
                                                                  int(row['start']),
                                                                  int(row['end']),
                                                                  parser=pysam.asTuple()):
                                PIDD_GENE += [genepanel_line[3]]
                                Inheritance += [genepanel_line[4]]
                                Phenotype += [genepanel_line[5]]
                            row['PIDD_GENE'] = "|".join(PIDD_GENE) if PIDD_GENE else 'NA'
                            row['Inheritance'] = "|".join(Inheritance) if Inheritance else 'NA'
                            row['Phenotype'] ="|".join(Phenotype) if Phenotype else 'NA'
                        except ValueError:
                            pass

                    if self.filter_line(row):
                        fout.write('\t'.join(row.values()) + '\n')
Example #58
def _get_hits(coords, annotation, parser_type):
    """Retrieve BED information, recovering if BED annotation file does have a chromosome.
    """
    if parser_type == "bed":
        parser = pysam.asBed()
    elif parser_type == "vcf":
        parser = pysam.asVCF()
    elif parser_type == "tuple":
        parser = pysam.asTuple()
    elif parser_type is None:
        parser = None
    else:
        raise ValueError("Unexpected parser type: %s" % parser)
    chrom, start, end = coords
    try:
        hit_iter = annotation.fetch(str(chrom), start, end, parser=parser)
    # catch invalid region errors raised by ctabix
    except ValueError:
        hit_iter = []
    return hit_iter
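
A hedged usage sketch of the helper above; the annotation path, region, and the BED name column index are assumptions.

import pysam

annotation = pysam.TabixFile("/path/to/annotation.bed.gz")   # assumed path
for hit in _get_hits(("chr1", 10000, 20000), annotation, "tuple"):
    print(hit[3])   # e.g. the BED name column when parsed with asTuple
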
Example #59
 def __init__(self, filepath, chromsizes, bins):
     try:
         import pysam
     except ImportError:
         raise ImportError("pysam is required to read tabix files")
     n_bins = len(bins)
     self.idmap = pandas.Series(index=chromsizes.keys(), data=range(len(chromsizes)))
     self.bins = bins
     self.binsize = get_binsize(bins)
     self.pairsfile = pysam.TabixFile(filepath, 'r', encoding='ascii')
     self.parser = pysam.asTuple()
     # number of lines in file
     p1 = subprocess.Popen(['pigz',  '-p', '8',  '-dc', filepath], stdout=subprocess.PIPE)
     p2 = subprocess.Popen(['wc', '-l'], stdin=p1.stdout, stdout=subprocess.PIPE)
     self.n_records = int(p2.communicate()[0])
     # convert genomic coords of bin starts to absolute
     self.idmap = pandas.Series(index=chromsizes.keys(), data=range(len(chromsizes)))
     bin_chrom_ids = self.idmap[bins['chrom']].values
     self.cumul_length = np.r_[0, np.cumsum(chromsizes)]
     self.abs_start_coords = self.cumul_length[bin_chrom_ids] + bins['start']
     # chrom offset index: chrom_id -> offset in bins
     chrom_nbins =  bins.groupby(bin_chrom_ids, sort=False).size()
     self.chrom_offset = np.r_[0, np.cumsum(chrom_nbins)]
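
The chrom offset index built at the end maps a chromosome id to its first bin row; a tiny worked example of that cumulative-sum construction (the per-chromosome bin counts are made up):

import numpy as np

chrom_nbins = np.array([3, 2, 4])                 # bins per chromosome (assumed)
chrom_offset = np.r_[0, np.cumsum(chrom_nbins)]   # -> [0, 3, 5, 9]

# Bins of the chromosome with id 1 occupy rows chrom_offset[1]:chrom_offset[2].
assert list(chrom_offset) == [0, 3, 5, 9]
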