Example #1
import numpy as np
from pysam import TabixFile


def get_interval_data(genes, INT):
    '''
    Get interval data for each gene from the (filename, name) pairs in INT.
    '''
    for fn, name in INT:
        tb = TabixFile(fn)

        for g in genes:

            # Get region for searching replication timing data
            g_len = g.total_length
            midp = round((g.start + g.stop) / 2)
            min_width = 10e3  # search region at least 10 kb
            if g_len < min_width:
                start = max(1, midp - round(min_width / 2))
                stop = midp + round(min_width / 2)
                gstr = '%s:%d-%d' % (g.chrom, start, stop)
            else:
                gstr = '%s:%d-%d' % (g.chrom, g.start, g.stop)

            # Call tabix to get data from the bedGraph
            try:
                it_genes = tb.fetch(gstr)
            except ValueError:  # handle regions where no interval can be made
                g.intervalData[name] = None
                continue
            intData = []
            for itr in it_genes:
                if itr == '': continue
                itr = itr.split('\t')
                intData.append(float(itr[-1]))
            if len(intData) > 0:
                g.intervalData[name] = np.mean(intData)
                continue
            else:
                # Extend search if value not found
                extends = [e for e in [50e3, 100e3, 500e3, 1e6] if e > g_len]
                found = False
                for e in extends:
                    start = max(1, midp - round(e / 2))
                    stop = midp + round(e / 2)
                    gstr = '%s:%d-%d' % (g.chrom, start, stop)

                    it_genes = tb.fetch(gstr)
                    for itr in it_genes:
                        if itr == '': continue
                        itr = itr.split('\t')
                        intData.append(float(itr[-1]))
                        found = True
                    if found:
                        g.intervalData[name] = np.mean(intData)
                        break

                if not found:
                    g.intervalData[name] = None

    return genes
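A minimal driver for the function above, assuming a bgzipped, tabix-indexed bedGraph and gene-like objects exposing chrom/start/stop/total_length plus an intervalData dict (all names below are hypothetical):

from types import SimpleNamespace

# Hypothetical stand-in for a gene record; any object with these attributes works.
gene = SimpleNamespace(chrom='chr1', start=1000000, stop=1002000,
                       total_length=2000, intervalData={})
INT = [('repli_timing.bedGraph.gz', 'RT')]  # (path, track name); file needs a .tbi index
get_interval_data([gene], INT)
print(gene.intervalData['RT'])  # mean bedGraph value around the gene, or None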
Example #2
def classify_peak(peak, sample, motifs):
    # ±300 bp (proximal) and ±2 kb (broad) windows around the peak summit
    pc_peak = (peak.contig,
               peak.start + peak.summit - 300,
               peak.start + peak.summit + 300)
    nc_peak = (peak.contig,
               peak.start + peak.summit - 2000,
               peak.start + peak.summit + 2000)
    # per motif: 1 = TF peak within ±300 bp, -1 = none within ±2 kb, 0 = otherwise
    status = []
    for motif in motifs:
        # tf_peak_fnames and RMID_term_name_mapping are module-level lookups (not shown)
        fname = tf_peak_fnames[
            (motif.tf_name, RMID_term_name_mapping[sample])][0]
        fp = TabixFile(fname)
        if peak.contig not in fp.contigs:
            status.append(0)
            continue
        pc_peaks = list(fp.fetch(*pc_peak))
        if len(pc_peaks) > 0:
            status.append(1)
            continue
        nc_peaks = list(fp.fetch(*nc_peak))
        if len(nc_peaks) == 0:
            status.append(-1)
        else:
            status.append(0)
    return status
Example #3
def main():
    args = parse_arguments()
    print(VariantFile(BUILD_TO_VCF[args.reference_build]).header)
    vcf_file = TabixFile(BUILD_TO_VCF[args.reference_build])
    rsid_file = TabixFile(BUILD_TO_RSID[args.reference_build],
                          index=f'{BUILD_TO_RSID[args.reference_build]}.csi')

    def rsid_to_coordinates(rsid):
        rs_number = int(rsid.replace('rs', ''))
        for row in rsid_file.fetch('rs', rs_number - 1, rs_number):
            chrom, pos = row.split()[2:]
            yield chrom, int(pos)

    for variant in args.variants:
        if COORD_REGEX.match(variant):
            chrom, pos = variant.split(':')
            chrom = chrom_to_hgvs(chrom, reference_build=args.reference_build)
            pos = int(pos)
            for row in vcf_file.fetch(chrom, pos - 1, pos):
                print(row)
        elif RSID_REGEX.match(variant):
            for chrom, pos in rsid_to_coordinates(variant):
                for row in vcf_file.fetch(chrom, pos - 1, pos):
                    print(row)
        else:
            raise RuntimeError('Improperly formatted query')
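The module-level constants are not shown above; a hedged sketch of definitions consistent with how main() uses them (file names are placeholders):

import re

COORD_REGEX = re.compile(r'^[0-9XYM]+:\d+$')  # e.g. '7:140453136'
RSID_REGEX = re.compile(r'^rs\d+$')           # e.g. 'rs113488022'
BUILD_TO_VCF = {'GRCh38': 'dbsnp.GRCh38.vcf.gz'}      # bgzipped VCF with a .tbi index
BUILD_TO_RSID = {'GRCh38': 'rsid_map.GRCh38.tsv.gz'}  # rsID-to-position table with a .csi index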
Example #4
def get_interval_data(regions, INT):
    '''
    Get interval data for each region from the (filename, name) pairs in INT.
    Computes the mean data value in a window around the region midpoint.
    '''
    for fn, name in INT:
        tb = TabixFile(fn)

        for r in regions:

            # Get region for searching replication timing data
            r_len = r.length
            midp = round((r.start + r.stop) / 2)
            min_width = 10e3  # search region at least 10 kb
            if r_len < min_width:
                start = max(1, midp - round(min_width / 2))
                stop = midp + round(min_width / 2)
                rstr = '%s:%d-%d' % (r.chrom, start, stop)
            else:
                rstr = r.region_string

            try:
                it_regions = tb.fetch(rstr)
            except ValueError:  # handle regions where no interval can be made
                r.intervalData[name] = None
                continue
            intData = []
            for rtr in it_regions:
                if rtr == '': continue
                intData.append(float(rtr.split('\t')[-1]))

            if len(intData) > 0:
                r.intervalData[name] = np.mean(intData)
                continue
            else:
                # Extend search if value not found
                extends = [50e3, 100e3, 500e3, 1e6]
                found = False
                for e in extends:
                    start = max(1, midp - round(e / 2))
                    stop = midp + round(e / 2)
                    rstr = '%s:%d-%d' % (r.chrom, start, stop)

                    it_regions = tb.fetch(rstr)
                    for rtr in it_regions:
                        if rtr == '': continue
                        intData.append(float(rtr.split('\t')[-1]))
                        found = True
                    if found:
                        r.intervalData[name] = np.mean(intData)
                        break

                if not found:
                    r.intervalData[name] = None

    return regions
Example #6
class _ALLC:
    def __init__(self, path, region):
        self.f = TabixFile(path)
        try:
            self.f_region = self.f.fetch(region)
        except ValueError:
            # contig/region absent from the index: fall back to an empty iterator
            self.f_region = TabixIterator()

    def readline(self):
        return self.f_region.next()

    def close(self):
        self.f.close()
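A sketch of how _ALLC might be consumed, assuming a bgzipped, tabix-indexed ALLC table (path and contig are placeholders):

allc = _ALLC('sample.allc.tsv.gz', 'chr1')
try:
    while True:
        print(allc.readline())
except StopIteration:
    pass
finally:
    allc.close()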
Example #7
    def __process_chromosome(self, chromosome_queue,
                             tabix_reader: pysam.TabixFile):
        vcf = VCF()
        samples = vcf.get_sample_names(self.vcf_file)
        while True:
            try:
                chromosome, size = chromosome_queue.get_nowait()
            except queue.Empty:
                time.sleep(0.1)
                continue
            if chromosome is None:
                break
            write_header = True

            window_writer = open(
                os.path.join(self.binned_output_folder,
                             chromosome + "_window.csv"), 'w')
            chunks = self.sliding_window_generator(size)
            print("\nScreening: {}".format(chromosome))
            for start_pos, end_pos in chunks:
                records = tabix_reader.fetch(chromosome,
                                             start_pos,
                                             end_pos,
                                             multiple_iterators=True)
                vcf_arr = [SNP(line, samples) for line in list(records)]
                alleles_window_sample_dict = self.determine_alleles(
                    vcf_arr, samples)
                self.__write_window_to_file(alleles_window_sample_dict,
                                            window_writer, chromosome,
                                            start_pos, end_pos, write_header)
                write_header = False
            window_writer.close()
Example #8
    def match_clinvar(self) -> None:
        """Match the input variant with the ClinVar table.

        Update :attr:`CharGerResult.clinvar` if the variant matches a ClinVar record
        by calling :meth:`_match_clinvar_one_variant`.
        """
        if self.config.clinvar_table is None:
            logger.info("Skip matching ClinVar")
            return
        logger.info(
            f"Match input variants with ClinVar table at {self.config.clinvar_table}"
        )
        clinvar_match_num = 0
        with TabixFile(str(self.config.clinvar_table),
                       encoding="utf8") as tabix:
            cols = tabix.header[0][len("#"):].split("\t")
            for result in self.results:
                record = self._match_clinvar_one_variant(
                    result.variant, tabix, cols)
                if record is not None:
                    result.clinvar = record
                    clinvar_match_num += 1
        logger.success(
            f"Matched {clinvar_match_num:,d} out of {len(self.input_variants):,d} input variants to a ClinVar record"
        )
Example #9
import pathlib
from collections import defaultdict

import pandas as pd
from pysam import TabixFile


def get_allc_lambda_frac(allc_list, num_upstr_bases):
    num_upstr_bases = int(num_upstr_bases)
    records = {}
    for path in allc_list:
        mc_counts = defaultdict(int)
        cov_counts = defaultdict(int)
        with TabixFile(str(path)) as allc:
            cell = pathlib.Path(path).name.split('.')[0]
            try:
                for line in allc.fetch('chrL'):
                    chrom, pos, strand, context, mc, cov, _ = line.split('\t')
                    # this will lead to only four contexts: CA, CC, CT, CG
                    context = context[num_upstr_bases:num_upstr_bases + 2]
                    mc_counts[context] += int(mc)
                    cov_counts[context] += int(cov)
                df = pd.DataFrame({'mc': pd.Series(mc_counts), 'cov': pd.Series(cov_counts)})
                df = df.reindex(['CG', 'CC', 'CT', 'CA']).fillna(0)  # reindex so all four contexts exist
                cy_cov = df.loc['CT', 'cov'] + df.loc['CC', 'cov']
                if cy_cov > 0:
                    cy_frac = (df.loc['CT', 'mc'] + df.loc['CC', 'mc']) / cy_cov
                else:
                    cy_frac = 0
                records[cell] = {'LambdaCYFrac': cy_frac, 'LambdaCYCov': cy_cov}
            except ValueError:
                # no chrL lines
                records[cell] = {'LambdaCYFrac': 0, 'LambdaCYCov': 0}
    records = pd.DataFrame(records).T
    return records
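A hypothetical call, assuming each ALLC file is tabix-indexed and contains a chrL (lambda spike-in) contig:

frac_df = get_allc_lambda_frac(['cellA.allc.tsv.gz', 'cellB.allc.tsv.gz'],
                               num_upstr_bases=0)
print(frac_df)  # one row per cell with LambdaCYFrac and LambdaCYCov columns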
Example #10
def clinvar_tabix(test_root):
    return TabixFile(
        str(
            test_root.joinpath(
                "examples/annotations/clinvar_chrom_22_only.b37.tsv.gz")),
        encoding="utf8",
    )
Example #11
from pysam import TabixFile, asTuple


def get_snp_data(*args, **kwargs):
    '''
    Proxy for TabixFile.fetch.
    '''
    kwargs['multiple_iterators'] = True
    return TabixFile(SNP_FILE, parser=asTuple()).fetch(*args, **kwargs)
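Because multiple_iterators is forced on, every call returns an independent iterator; a hypothetical query (SNP_FILE must name a bgzipped, indexed table):

for rec in get_snp_data('chr1', 10000, 20000):
    print(rec[0], rec[1])  # asTuple() yields each row as a tuple of column strings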
Example #12
def vcf2chain(input_file, fasta_file, strain, output_file, vcf_keep=False, passed=False, quality=False, diploid=False):
    """

    :param input_file:
    :param fasta_file:
    :param strain:
    :param output_file:
    :param vcf_keep:
    :param passed:
    :param quality:
    :param diploid:
    :return:
    """
    start = time.time()

    input_file = g2g_fu.check_file(input_file)
    fasta_file = g2g_fu.check_file(fasta_file)

    if not strain:
        raise G2GValueError("No strain was specified.")

    output_file = g2g_fu.check_file(output_file, 'w')
    output_file_dir = os.path.dirname(output_file)

    LOG.info("VCF FILE: {0}".format(input_file))
    LOG.info("FASTA FILE: {0}".format(fasta_file))
    LOG.info("CHAIN FILE: {0}".format(output_file))

    vcf_discard_file = None

    if vcf_keep:
        vcf_discard_file = "{0}.errors.vcf".format(os.path.basename(input_file))
        vcf_discard_file = os.path.join(output_file_dir, vcf_discard_file)
        LOG.info("VCF DISCARD FILE: {0}".format(vcf_discard_file))

    LOG.info("STRAIN: {0}".format(strain))
    LOG.info("PASS FILTER ON: {0}".format(str(passed)))
    LOG.info("QUALITY FILTER ON: {0}".format(str(quality)))
    LOG.info("DIPLOID: {0}".format(str(diploid)))

    if not isinstance(fasta_file, FastaFile):
        fasta_file = FastaFile(fasta_file)

    tb = TabixFile(input_file)
    sample_index = None

    for h in tb.header:
        if h[:6] == '#CHROM':
            try:
                elems = h.split('\t')
                samples = elems[9:]
                samples = {s: i for i, s in enumerate(samples)}
                sample_index = samples[strain]
            except KeyError:
                raise G2GVCFError("Unknown strain '{0}', valid strains are: {1}".format(strain, ", ".join(samples)))
Example #13
    def __init__(self,
                 chrom=None,
                 pos=None,
                 id=None,
                 reference_build='GRCh38'):
        if chrom and pos and not id:
            self.chrom = (chrom_to_hgvs(chrom,
                                        reference_build=reference_build), )
            self.pos = (int(pos), )
        elif id and not (chrom or pos):
            rs_number = int(id.replace('rs', ''))
            self.chrom, self.pos = zip(*(row.split()[2:] for row in TabixFile(
                BUILD_TO_RSID[reference_build],
                index=f'{BUILD_TO_RSID[reference_build]}.csi').fetch(
                    'rs', rs_number - 1, rs_number)))
            self.pos = tuple(int(p) for p in self.pos)
        else:
            raise ValueError('Invalid input parameters')

        _, _, self.id, self.ref, self.alt, _, _, self.info = zip(
            *(row.split() for chrom, pos in zip(self.chrom, self.pos)
              for row in TabixFile(BUILD_TO_VCF[reference_build]).fetch(
                  chrom, pos - 1, pos)))
Example #14
import numpy as np
import pybedtools as pbt
from pysam import TabixFile


def tabix_region(bedpath, querybt):
    """
    Uses tabix to extract all windows overlapping the intervals in querybt
    Returns: pbt.BedTool of all windows in query
    """

    # Format query (assumes querybt lies on a single chromosome)
    chrom = querybt[0].chrom
    start = str(np.nanmin(querybt.cut(range(3)).to_dataframe().start))
    end = str(np.nanmax(querybt.cut(range(3)).to_dataframe().end))
    region = '{}:{}-{}'.format(chrom, start, end)

    # Extract all windows
    return pbt.BedTool('\n'.join([x for x in TabixFile(bedpath).fetch(region)]),
                       from_string=True)
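Example invocation, assuming windows.bed.gz is bgzipped and tabix-indexed (paths are placeholders):

querybt = pbt.BedTool('chr1\t15000\t16000\nchr1\t18000\t19000\n',
                      from_string=True)
windows = tabix_region('windows.bed.gz', querybt)
print(len(windows))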
Example #15
def get_snps(pid):
    '''
    return sequences mentioned in SNPData.csv
    '''
    coords = map(make_coord_string, snps.COORDINATES.values())
    search_args = {
        'coordinate': ','.join(coords),
        'patient': pid,
        '_count': 100000
    }
    seq_bundle = call_api('/Sequence', search_args) 
    seqs = (entry['content'] for entry in seq_bundle['entry'])
    translation_f = TabixFile(SNP_TRANSLATION_FNAME, parser=asTuple()) 
    return jsonify({
        get_rsid(translation_f, seq): seq['observedSeq']
        for seq in seqs
    })
Example #16
    @staticmethod
    def _match_clinvar_one_variant(
            variant: Variant, tabix: TabixFile,
            cols: List[str]) -> Optional[Dict[str, Any]]:
        """Match the variant to the given ClinVar tabix table.

        Args:
            variant: Variant to be matched
            tabix: Tabix-indexed ClinVar table
            cols: All ClinVar columns in the table

        Returns:
            None if no ClinVar match. When matched, returns a `dict` of the clinvar record,
            where the key ``final_clinical_significance`` stores the final clinical significance type
            in :class:`ClinicalSignificance`.
        """
        try:
            # TabixFile.fetch will raise ValueError if the given region is out of bound
            row_iter = tabix.fetch(
                region=f"{variant.chrom}:{variant.start_pos}-{variant.end_pos}"
            )
        except ValueError as e:
            # Do nothing if it's querying for a chromosome not in the ClinVar table
            if "could not create iterator for region" not in e.args[0]:
                logger.opt(
                    exception=e).debug(f"Tabix fetch ClinVar failed: {e}")
            return None

        for row in row_iter:
            record = dict(zip(cols, row.split("\t")))
            if (int(record["start"]) == variant.start_pos
                    and int(record["stop"]) == variant.end_pos
                    and record["alt"] == variant.alt_allele):
                if record["ref"] != variant.ref_allele:
                    logger.warning(
                        f"{variant!r} got a clinvar match but their reference alleles are different: "
                        f"{variant.ref_allele!r} != {record['ref']!r}")
                # Parse the clinical significance of the record
                record[
                    "final_clinical_significance"] = ClinicalSignificance.parse_clinvar_record(
                        record)
                return record
        return None
Example #17
import re

import pandas as pd
from pysam import TabixFile


class ExploreGnomad:
    def __init__(self, gnomad_file, frequency_table):
        self.gnomad = TabixFile(gnomad_file)
        self.frequencies = pd.read_csv(frequency_table, sep="\t", header=None)
        self.frequencies.columns = ["CHR:POS", "REF", "ALT", "AF"]
        self.frequencies[["CHR",
                          "POS"]] = self.frequencies["CHR:POS"].str.split(
                              ":", expand=True)
        self.frequencies["POS"] = self.frequencies["POS"].astype(int)

    def search_position(self, chrom, pos, ref, alt):
        query_lines = self.gnomad.fetch(chrom, pos - 1, pos)
        for variant in query_lines:
            variant_split = variant.split("\t")
            var_ref, var_alt = variant_split[3:5]
            if ref == var_ref and alt == var_alt:
                info_line = variant_split[-1]
                match = re.search(";AF_nfe=([0-9.e+\\-]+);", info_line)
                if match:
                    return match.group(1)
        return None

    def search_all(self, output_path):
        nfe_AF = [None] * len(self.frequencies)
        for i, row in self.frequencies.iterrows():
            if i % 1000 == 0:
                print(f"{round(100*i/len(self.frequencies))} % Done")
            nfe_AF[i] = self.search_position(row["CHR"], row["POS"],
                                             row["REF"], row["ALT"])

        self.frequencies["nfe_AF"] = nfe_AF
        self.frequencies.to_csv(
            output_path,
            sep="\t",
            index=False,
            columns=["CHR", "POS", "REF", "ALT", "AF", "nfe_AF"])
Example #20
def main():
    """
    Main block
    """

    # Parse command line arguments and options
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('gtf', help='GTF of genes to consider.')
    parser.add_argument('pext', help='BED of pext scores. Must be tabixed.')
    parser.add_argument('--min-pext',
                        default=0.1,
                        type=float,
                        help='Minimum mean pext score to retain exon. ' +
                        '[default: 0.1]')
    parser.add_argument('-o',
                        '--outgtf',
                        help='Path to output GTF file. ' + '[default: stdout]')
    parser.add_argument('-z',
                        '--bgzip',
                        dest='bgzip',
                        action='store_true',
                        help='Compress output GTF with bgzip.')
    parser.add_argument('--lost-genes',
                        help='Path to output file listing genes ' +
                        'lost due to pext filtering.')

    args = parser.parse_args()

    # Resolve output path (outgtf_path feeds saveas() below)
    if args.outgtf is None \
    or args.outgtf in 'stdout -'.split():
        outgtf_path = '/dev/stdout'  # POSIX device path so the GTF streams to stdout
    else:
        if path.splitext(
                args.outgtf)[-1] in '.gz .bz .bgz .bgzip .gzip'.split():
            outgtf_path = path.splitext(args.outgtf)[0]
        else:
            outgtf_path = args.outgtf

    # Load GTF & pext data
    gtfbt, genes, transcripts = load_gtf(args.gtf)
    pext = TabixFile(args.pext)

    # Apply pext filter
    gtfbt, filter_stats = pext_filter(gtfbt, pext, genes, args.min_pext)
    gtfbt.saveas(outgtf_path)
    filt_msg = 'Finished. Removed {:,} exons, resulting in the loss of {:,} genes.'
    print(
        filt_msg.format(filter_stats['n_exons_lost'],
                        filter_stats['n_genes_lost']) + '\n')

    # Bgzip output GTF, if optioned
    if args.outgtf is not None \
    and args.outgtf not in 'stdout -'.split() \
    and args.bgzip:
        subprocess.run(['bgzip', '-f', outgtf_path])

    # Write out list of lost genes, if optioned
    if args.lost_genes is not None:
        with open(args.lost_genes, 'w') as lost_out:
            for gene in filter_stats['genes_lost']:
                lost_out.write(gene + '\n')
Example #21
GENOME_APP_NAME = basename(GENOME_APP_DIRECTORY_PATH)

INPUT_DIRECTORY_PATH = join(GENOME_APP_DIRECTORY_PATH, 'input')
PERSON_DIRECTORY_PATH = join(INPUT_DIRECTORY_PATH, 'person')
GRCH_DIRECTORY_PATH = join(INPUT_DIRECTORY_PATH, 'grch')

TOOLS_DIRECTORY_PATH = join(GENOME_APP_DIRECTORY_PATH, 'tools')
OUTPUT_DIRECTORY_PATH = join(GENOME_APP_DIRECTORY_PATH, 'output')
MEDIA_DIRECTORY_PATH = join(GENOME_APP_DIRECTORY_PATH, 'media')

REGION_FILE = join(INPUT_DIRECTORY_PATH, 'non_PAR_region.bed')

VCF_FILE = join(PERSON_DIRECTORY_PATH, 'genome.vcf.gz')

tbx = TabixFile(VCF_FILE)


def get_format_index():
    """
    Get the index of the FORMAT field in VCF file
    Returns:
        int: index of FORMAT field
    """
    try:
        for row in tbx.header:
            line = row.decode('UTF-8')
            if line.startswith('#CHROM') and 'FORMAT' in line:
                index = line.split('\t').index('FORMAT')
                return index
    except NameError:
        # tbx is undefined when the VCF was never opened; fall through and return None
        pass
Example #22
    def _fetch(self, region):
        if not self.has_tabix:
            raise Exception('Currently, tabix is required for region query')
        with TabixFile(self._gtf, parser=self._parser) as tabix:
            for row in tabix.fetch(region):
                yield row
Example #23
contig = None
if args.chrom in contigs:
    contig = contigs[args.chrom]
elif 'chr%s' % args.chrom in contigs:
    contig = contigs['chr%s' % args.chrom]
else:
    raise Exception('Trouble finding contig %s in %s' % (args.chrom, contig_names))
print('Chrom length', contig.length)

vcf_files = [args.vcf_file]
if args.additional_vcf_files is not None:
    vcf_files.extend(args.additional_vcf_files)

if np.all([os.path.isfile(vcf_file + '.tbi') for vcf_file in vcf_files]):
    vcfs = [TabixFile(vcf_file, parser=None) for vcf_file in vcf_files]

    if args.batch_size != -1:
        start_pos, end_pos = args.batch_num * args.batch_size, (
            args.batch_num + 1) * args.batch_size
        print('Interval', start_pos, end_pos)
        if start_pos < contig.length:
            process_body(
                itertools.chain(*[
                    vcf.fetch(
                        reference=contig.name, start=start_pos, end=end_pos)
                    for vcf in vcfs
                ]), sample_ids)
        else:
            print('Interval (%d-%d) is longer than chromosome (length=%d).' %
                  (start_pos, end_pos, contig.length))
Example #24
def pair_bins(query_bins,
              all_bins,
              outfile,
              max_dist,
              exclusion_list,
              excl_buffer,
              annotate_dist,
              sort_features,
              annotate_absdiff,
              maxfloat,
              bgzip,
              input_has_header=True):
    """
    Create pairs of bins from input BED
    """

    # Open connection to infiles & outfile
    if determine_filetype(query_bins) == 'compressed-bed':
        fin = gzip.open(query_bins, 'rt')
    else:
        fin = open(query_bins)
    if input_has_header:
        colnames = [
            k.replace('#', '') for k in fin.readline().rstrip().split('\t')
        ]
    else:
        colnames = []
    if all_bins is None:
        bins_tabix = TabixFile(query_bins)
    else:
        bins_tabix = TabixFile(all_bins)
    xbt = load_exclusion_bts(exclusion_list, excl_buffer)

    # Open connection to output file
    out_ftype, out_ext = determine_filetype(outfile, return_extension=True)
    if 'compressed' in out_ftype:
        outpath = outfile.replace(out_ext, 'bed')
    else:
        outpath = outfile
    fout = open(outpath, 'w')

    # Format header and write to outfile
    header = '#chr start end'.split()
    if annotate_dist:
        header.append('distance')
    for fname in colnames[3:]:
        if sort_features:
            fname_suffixes = ['min', 'max']
        else:
            fname_suffixes = ['left', 'right']
        if annotate_absdiff:
            fname_suffixes.append('absdiff')
        header += ['_'.join([fname, v]) for v in fname_suffixes]
    fout.write('\t'.join(header) + '\n')

    # Identify and curate all pairs for each bin in fin
    for query_line in fin:
        query_vals = query_line.rstrip().split('\t')
        _get_pairs(fout, query_vals, bins_tabix, max_dist, xbt,
                   annotate_dist, sort_features, annotate_absdiff,
                   maxfloat)

    # Clean up
    fout.close()
    if bgzip:
        bgz(outpath)
Example #25
    @classmethod
    def setup(cls, source):
        curr = cls(source)
        curr.fs = [TabixFile(i, parser=asBed()) for i in curr.source]
        return curr
Example #26
    def main(self, args):
        command.Command.main(self, args)
        self.validate(args)
        for i in [1, 2]:
            attr = "pop%d" % i
            pid, ary = getattr(args, attr)
            if len(ary) == 1 and ary[0][0] == "@":
                setattr(args, attr, SampleList(
                    pid, open(ary[0][1:], "rt").read().strip().split("\n")))
        pop_d = dict([args.pop1, args.pop2])
        for pid in pop_d:
            if pop_d[pid]:
                c = Counter(pop_d[pid])
                if max(c.values()) > 1:
                    raise RuntimeError(
                        "Population %s has duplicated samples: %s" %
                        (pid, [item for item in c.items() if item[1] > 1]))
        dist = [[], []]
        if not args.d:
            first_sid = args.pop1.samples[0]
            args.d = [first_sid] * 2
        args.d = [args.d[0] + ":0", args.d[1] + ":1"]
        all_samples = set(args.pop1.samples) | set(args.pop2.samples)
        for sid_i in args.d:
            sid, i = sid_i.split(":")
            i = int(i)
            if sid not in all_samples:
                raise RuntimeError("%s is not in the sample list" % sid)
            if sid in args.pop1.samples:
                d = dist[0]
            else:
                assert sid in args.pop2.samples
                d = dist[1]
            d.append((sid, i))
        undist = [[(k, i) for k in p.samples for i in (0, 1) if (k, i) not in d]
                  for p, d in zip((args.pop1, args.pop2), dist)]
        npop = 1

        def print_pop(i):
            logger.info("Population %d:" % i)
            logger.info("Distinguished lineages: " +
                        ", ".join("%s:%d" % t for t in dist[i - 1]))
            logger.info("Undistinguished lineages: " +
                        ", ".join("%s:%d" % t for t in undist[i - 1]))
        print_pop(1)
        if args.pop2.pid is not None:
            npop = 2
            common = set(args.pop1.samples) & set(args.pop2.samples)
            if common:
                logger.error("Populations 1 and 2 should be disjoint, "
                             "but both contain " + ", ".join(common))
                sys.exit(1)
            print_pop(2)

        # Start parsing
        vcf = VariantFile(args.vcf)
        with optional_gzip(args.out, "wt") as out:
            samples = list(vcf.header.samples)
            dist = dist[:npop]
            undist = undist[:npop]
            if not set([dd[0] for d in dist for dd in d]) <= set(samples):
                raise RuntimeError("Distinguished lineages not found in data?")
            missing = [s for u in undist for s, _ in u if s not in samples]
            if missing:
                msg = "The following samples were not found in the data: %s. " % ", ".join(
                    missing)
                if args.ignore_missing:
                    logger.warning(msg)
                else:
                    msg += "If you want to continue without these samples, use --ignore-missing."
                    raise RuntimeError(msg)
            undist = [[t for t in u if t[0] not in missing] for u in undist]

            # Write header
            pids = [a.pid for a in (args.pop1, args.pop2)[:npop]]
            out.write("# SMC++ ")
            json.dump({"version": version, "pids": pids,
                       "undist": undist, "dist": dist}, out)
            out.write("\n")
            na = list(map(len, dist))
            nb = list(map(len, undist))

            # function to convert a VCF record to our format:
            # <span, dist gt, # undist gt, # undist, [...]>
            def rec2gt(rec):
                ref = rec.alleles[0]
                for di in dist:
                    for d, i in di:
                        if len(rec.samples[d].alleles) != 2:
                            raise RuntimeError(
                                "Expected a diploid genotype at position {} "
                                "for individual {} but found:\n{}".format(rec.pos, d,
                                    list(rec.samples[d].alleles)))
                da = [[rec.samples[d].alleles[i]
                       for d, i in di] for di in dist]
                a = [sum([x != ref for x in d])
                     if None not in d else -1 for d in da]
                bs = [[rec.samples[d].alleles[i] != ref
                       for d, i in un
                       if rec.samples[d].alleles[i] is not None]
                      for un in undist]
                b = [sum(_) for _ in bs]
                nb = [len(_) for _ in bs]
                # Fold non-polymorphic (in subsample) sites
                if np.array_equal(b, nb) and np.array_equal(a, na):
                    a = [0] * len(a)
                    b = [0] * len(b)
                return list(sum(zip(a, b, nb), tuple()))

            try:
                region_iterator = vcf.fetch(contig=args.contig)
            except ValueError as e:
                logger.error("VCF reader threw an error: %s", e)
                logger.error("Make sure the VCF is indexed:")
                logger.error("")
                logger.error("    $ tabix %s", args.vcf)
                logger.error("")
                sys.exit(1)

            contig_length = args.length or vcf.header.contigs[args.contig].length
            if contig_length is None:
                logger.error("Failed to acquire contig length from VCF header. See the --length option.")
                sys.exit(1)
            if args.mask:
                mask_iterator = TabixFile(
                    args.mask).fetch(reference=args.contig)
                args.missing_cutoff = np.inf
            else:
                mask_iterator = iter([])
                if args.missing_cutoff is None:
                    args.missing_cutoff = np.inf
            mask_iterator = (x.split("\t") for x in mask_iterator)
            mask_iterator = ((x[0], int(x[1]), int(x[2]))
                             for x in mask_iterator)
            snps_only = (
                rec for rec in region_iterator if
                len(rec.alleles) <= 2 and
                all(len(a) == 1 for a in rec.alleles)
                )

            def interleaved():
                cmask = next(mask_iterator, None)
                csnp = next(snps_only, None)
                while cmask or csnp:
                    if cmask is None:
                        yield "snp", csnp
                        csnp = next(snps_only, None)
                    elif csnp is None:
                        yield "mask", cmask
                        cmask = next(mask_iterator, None)
                    else:
                        if csnp.pos < cmask[1]:
                            yield "snp", csnp
                            csnp = next(snps_only, None)
                        elif csnp.pos <= cmask[2]:
                            while csnp is not None and csnp.pos <= cmask[2]:
                                csnp = next(snps_only, None)
                            yield "mask", cmask
                            cmask = next(mask_iterator, None)
                        else:
                            yield "mask", cmask
                            cmask = next(mask_iterator, None)

            abnb_miss = [-1, 0, 0] * len(nb)
            abnb_nonseg = sum([[0, 0, x] for x in nb], [])
            multiples = set()
            with RepeatingWriter(out) as rw, \
                    tqdm.tqdm(total=contig_length, unit='bases', unit_scale=True) as bar:
                def write(x):
                    if not write.first or not args.drop_first_last:
                        rw.write(x)
                    write.first = False
                write.first = True
                last_pos = 0
                for ty, rec in interleaved():
                    if ty == "mask":
                        span = rec[1] - last_pos
                        write([span] + abnb_nonseg)
                        write([rec[2] - rec[1] + 1] + abnb_miss)
                        last_pos = rec[2]
                        continue
                    bar.update(rec.pos - last_pos)
                    abnb = rec2gt(rec)
                    if rec.pos == last_pos:
                        multiples.add(rec.pos)
                        continue
                    span = rec.pos - last_pos - 1
                    if 1 <= span <= args.missing_cutoff:
                        write([span] + abnb_nonseg)
                    elif span > args.missing_cutoff:
                        write([span] + abnb_miss)
                    write([1] + abnb)
                    last_pos = rec.pos
                if not args.drop_first_last:
                    write([contig_length - last_pos] + abnb_nonseg)
            if multiples:
                # FIXME: what to do with multiple records at same site
                logger.warning(
                    "Multiple entries found at %d positions; skipped all but the first", len(multiples))
Example #27
            if row['Chromosome'] is None:
                disease = row['SNP']
                continue
            row['disease'] = disease[0:-1]
            snps[row['SNP']] = row

    with open('DrugInfo.csv') as src:
        drug_info = {row['SNP']: row for row in csv.DictReader(src)}

    with open('okg.ped') as pop_src:
        # mapping: sample id -> population id
        populations = {
            indiv['Individual ID']: indiv['Population']
            for indiv in csv.DictReader(pop_src, delimiter='\t')
        }

    print('Determining genomic coordinates for sequences.')
    f = TabixFile('snps.sorted.txt.gz', parser=asTuple())
    snp_table = {}
    for row in f.fetch():
        _, snp, chrom, pos = row
        if snp in snps or snp in drug_info:
            snp_table[snp] = {'chromosome': chrom, 'pos': int(pos)}
    with open('snps.py', 'w') as dump:
        dump.write(WARNING)
        dump.write('COORDINATES = %s\n' % snp_table)
        dump.write('DATA = %s\n' % snps)
        dump.write('DRUG_INFO = %s\n' % drug_info)
    print('Data written to snps.py')
    print('Determining allele frequencies (using data from 1000 Genomes)')
    genotypes = {snp: snp_data['Code'] for snp, snp_data in snps.items()}
    variants = list(
        ga4gh.search_variants(genotypes, dataset=ga4gh.OKG, repo_id='google'))
Example #28
import pysam
from pysam import VariantFile
from pysam import TabixFile
from pyfaidx import Fasta

# data files
reference_file = 'S_lycopersicum_chromosomes.2.40.fa'
annotation_file = 'gene_models.gff.gz'
variant_file = 'tomato_snps.bcf'

# load reference
reference = Fasta(reference_file)

# load annotations
annotations = TabixFile(annotation_file)

# load variants
variants = VariantFile(variant_file)

# regions to query
region1 = ("SL2.40ch01", 15000, 21000)
region2 = ("SL2.40ch01", 20000, 70000)

region1_reference = reference[region1[0]][region1[1]: region1[2]]
region1_annotations = [a for a in annotations.fetch(*region1, parser=pysam.asGTF())]
region1_variants = [a for a in variants.fetch(*region1)]

region2_reference = reference[region2[0]][region2[1]: region2[2]]
region2_annotations = [a for a in annotations.fetch(*region2, parser=pysam.asGTF())]
region2_variants = [a for a in variants.fetch(*region2)]
Example #29
import numpy as np
import pandas as pd
from intake.source.base import DataSource, Schema
from pysam import TabixFile, asTuple


class IndexedBedFile(DataSource):
    name = "indexed_bedfile"
    version = "0.1.0"
    container = "dataframe"
    partition_access = False
    description = "A bgzipped and indexed bedfile"

    def __init__(self, urlpath, include_unmapped=True, metadata=None):
        self._urlpath = urlpath
        self._include_unmapped = include_unmapped
        self._dataset = None
        self._dtype = None
        self._chroms = None
        super(IndexedBedFile, self).__init__(metadata=metadata)

    def _open_dataset(self):
        self._dataset = TabixFile(self._urlpath)

    def _get_schema(self):
        if self._dataset is None:
            self._open_dataset()
        self._chroms = list(self._dataset.contigs)

        rec = next(self._dataset.fetch(self._chroms[0], parser=asTuple()))
        num_fields = len(rec)

        chrom_coord_dtype = np.int64
        dtypes = {
            "chrom": pd.CategorialDtype(self._chroms + ["NULL"], ordered=True),
            "start": chrom_coord_dtype,
            "end": chrom_coord_dtype,
            "name": str,
            "score": np.float32,
            "strand": bool,
        }
        self._dtype = {
            key: dtypes[key]
            for key in list(dtypes.keys())[:num_fields]
        }
        return Schema(
            datashape=None,
            dtype=self._dtype,
            shape=(None, len(self._dtype)),
            npartitions=len(self._chroms),
            extra_metadata={},
        )

    def _get_partition(self, i):
        chrom = self._chroms[i]
        columns = list(self._dtype.keys())
        return pd.DataFrame(list(self._dataset.fetch(chrom, parser=asTuple())),
                            columns=columns).astype(self._dtype)

    def read(self):
        self._load_metadata()
        return pd.concat(
            [self.read_partition(i) for i in range(self.npartitions)],
            ignore_index=True)

    def _close(self):
        # close any files, sockets, etc
        if self._dataset is not None:
            self._dataset.close()
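Reading the source end to end, assuming regions.bed.gz has a matching .tbi index (path is a placeholder):

src = IndexedBedFile('regions.bed.gz')
df = src.read()  # one partition per contig, concatenated
print(df.dtypes)
src.close()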
Example #31
    @classmethod
    def setup(cls, source):
        curr = cls(source)
        curr.f = TabixFile(curr.source, parser=asBed())
        return curr