Exemple #1
0
def alt_fasta():
    ''' A simple fasta that has all alt calls for smpl_annot'''
    fasta = Fasta('alt_fasta')
    chr1 = Chromosome('chr1','C'*500000)
    chr2 = Chromosome('chr2','G'*500000)
    chr3 = Chromosome('chr3','T'*500000)
    chr4 = Chromosome('chr4','A'*500000)
    fasta.add_chrom(chr1,force=True)   
    fasta.add_chrom(chr2,force=True)   
    fasta.add_chrom(chr3,force=True)   
    fasta.add_chrom(chr4,force=True)   
    return fasta
Exemple #2
0
def smpl_fasta():
    ''' A simple fasta that agrees with smpl_annot'''
    fasta = Fasta('smpl_fasta')
    chr1 = Chromosome('chr1','A'*500000)
    chr2 = Chromosome('chr2','C'*500000)
    chr3 = Chromosome('chr3','G'*500000)
    chr4 = Chromosome('chr4','T'*500000)
    fasta.add_chrom(chr1,force=True)   
    fasta.add_chrom(chr2,force=True)   
    fasta.add_chrom(chr3,force=True)   
    fasta.add_chrom(chr4,force=True)   
    return fasta
Exemple #3
0
def smpl_fasta():                                                                  
    ''' A simple fasta that agrees with smpl_annot'''                           
    m80tools.delete('Fasta','smpl_fasta',force=True)
    fasta = Fasta('smpl_fasta')                                                             
    chr1 = Chromosome('chr1','A'*500000)                                                  
    chr2 = Chromosome('chr2','C'*500000)                                               
    chr3 = Chromosome('chr3','G'*500000)                                               
    chr4 = Chromosome('chr4','T'*500000)                                                  
    fasta.add_chrom(chr1)                                                
    fasta.add_chrom(chr2)                                                
    fasta.add_chrom(chr3)                                                
    fasta.add_chrom(chr4)                                                
    fasta._add_nickname('chr1','CHR1')
    fasta._add_attribute('chr1','foobar')
    return fasta  
def print_flank_sequence(vcf,fasta,out,window=30):                                     
    vcf = pysam.VariantFile(vcf)                                                   
    if not available("Fasta",'temp'): 
        fasta = Fasta.from_file('temp',fasta)                                                 
    else:
        fasta = Fasta('temp')
    

    with open(out,'w') as OUT:
        for i,var in enumerate(vcf):                                                   
            # Grab the FASTA flank sequence
            if fasta[var.chrom][var.pos] != var.ref:
                print(f"{var.id} REF does not match the FASTA. oops.")
            # Grab the probe sequence anyways
            kmer = ''.join(fasta[var.chrom][var.pos+1:var.pos+window]).upper()
            print(">{}.{}_{}\n{}".format(var.chrom, var.pos, "1", kmer),file=OUT)
            kmer = ''.join(fasta[var.chrom][var.pos-window:var.pos-1]).upper()
            print(">{}.{}_{}\n{}".format(var.chrom, var.pos, "2", kmer),file=OUT)                       
Exemple #5
0
def m80_Fasta():
    '''
        Create a Fasta which doesn't get 
        returned. Access the Fasta through
        the m80 API
    '''
    # delete the onl
    m80tools.delete('Fasta','ACGT',force=True)
    f = Fasta.from_file('ACGT','raw/ACGT.fasta')
    return True
Exemple #6
0
    def _add_fasta(self, fasta):
        '''
            Add a reference geneome sequence to the database.
            This reference sequence will be used to sort and 
            conform the genotype (VCF) files used for imputation.
            A fasta can only be attached once to a hapdab.

            Parameters
            ----------
            fasta : str, path-like or a locuspocus.Fasta object
                Path to the fasta file

            Returns
            -------
            None if successful. See the API for accessing the
            fasta object.

            Raises
            ------
            ValueError if a fasta has already been assigned to the
            database.

        '''
        if 'Fasta' in self._dict:
            raise ValueError(
                'A Fasta has already been assigned to this database!')
        if os.path.exists(fasta):
            f = Fasta.from_file(self._m80_name, fasta, parent=self)
        elif m80.Tools.available('Fasta', fasta):
            f = Fasta(fasta)
            os.symlink(f._basedir,
                       os.path.join(self._basedir, f'Fasta.{f._m80_name}'))
        else:
            raise ValueError(f'Unable to determine the type of fasta')
        self._fasta = f
        # and remember for next time
        self._dict['Fasta'] = f._m80_name
Exemple #7
0
 def __init__(self, name, parent=None):
     super().__init__(name)
     if 'Fasta' in self._dict:
         self._fasta = Fasta(self._dict['Fasta'], parent=self)
Exemple #8
0
class HapDab(Freezable):

    # Init method
    log = logging.getLogger(__name__)
    handler = logging.StreamHandler()
    formatter = logging.Formatter(
        '%(asctime)s %(name)-12s %(levelname)-8s %(message)s')
    handler.setFormatter(formatter)
    log.addHandler(handler)
    log.setLevel(logging.INFO)

    def __init__(self, name, parent=None):
        super().__init__(name)
        if 'Fasta' in self._dict:
            self._fasta = Fasta(self._dict['Fasta'], parent=self)

    # --------------------------------------------
    # Class Methods
    # --------------------------------------------

    @classmethod
    def from_files(cls, name, vcffile, fastafile, skip_phase=False):
        self = cls(name)
        self._add_fasta(fastafile)
        self._add_ref_vcf(vcffile, skip_phase=skip_phase)
        return self

    # --------------------------------------------
    # Public Methods
    # --------------------------------------------

    def impute(self, vcf_file):
        '''
            Impute a VCF file to the density of the 
            Reference VCF
        '''
        # Get a temp file for the output
        imputed_vcf = self._tmpfile()
        # Beagle needs a file with one common samples to exclude
        #self.log.info('Finding common samples to exclude')
        #common_sample = self._tmpfile()
        #with open(vcf_file,'r') as IN:
        #    for line in IN:
        #        if line.startswith('#CHROM'):
        #            fields = line.split()
        #            self.log.info(f'Found {len(fields[9:])} samples in common')
        #            common_sample.write("\n".join(fields[9:]))
        #            common_sample.flush()
        #            break
        #        elif line.startswith('##'):
        #            continue
        #        else:
        #            raise Exception('Could not find sample IDS in VCF header')
        # Get the path of BEAGLE
        beagle_path = pkg_resources.resource_filename(
            'hapdab', 'include/beagle/beagle.28Sep18.793.jar')
        ref_vcf = os.path.join(self._basedir, 'reference.vcf.gz')
        # Create a command
        cmd = (f"java -jar {beagle_path} impute=true ref={ref_vcf} "
               f"gt={vcf_file} out={imputed_vcf.name}").split()
        phaser = subprocess.run(cmd)
        if phaser.returncode != 0:
            raise ValueError("Imputation failed!")
        return imputed_vcf.name + '.vcf.gz'

    # --------------------------------------------
    # Private Methods
    # --------------------------------------------

    def _add_fasta(self, fasta):
        '''
            Add a reference geneome sequence to the database.
            This reference sequence will be used to sort and 
            conform the genotype (VCF) files used for imputation.
            A fasta can only be attached once to a hapdab.

            Parameters
            ----------
            fasta : str, path-like or a locuspocus.Fasta object
                Path to the fasta file

            Returns
            -------
            None if successful. See the API for accessing the
            fasta object.

            Raises
            ------
            ValueError if a fasta has already been assigned to the
            database.

        '''
        if 'Fasta' in self._dict:
            raise ValueError(
                'A Fasta has already been assigned to this database!')
        if os.path.exists(fasta):
            f = Fasta.from_file(self._m80_name, fasta, parent=self)
        elif m80.Tools.available('Fasta', fasta):
            f = Fasta(fasta)
            os.symlink(f._basedir,
                       os.path.join(self._basedir, f'Fasta.{f._m80_name}'))
        else:
            raise ValueError(f'Unable to determine the type of fasta')
        self._fasta = f
        # and remember for next time
        self._dict['Fasta'] = f._m80_name

    def _add_ref_vcf(self, vcf_file, skip_conform=False, skip_phase=False):
        '''
            Add a reference VCF to the database.
        '''
        original_vcf = vcf_file
        if not skip_conform:
            conformed = self._conform_vcf(vcf_file)
            vcf_file = conformed.name
        if not skip_phase:
            phased = self._phase_vcf(vcf_file)
            vcf_file = phased.name + '.vcf.gz'
        # Copy the file
        dest_path = os.path.join(self._basedir, 'reference.vcf.gz')
        try:
            # peek will return if gzipped
            gzip.open(vcf_file).peek(1)
            shutil.copy(vcf_file, dest_path)
        except OSError as e:
            # gzip will throw an AttributeError excpetion if not gzipped
            self.log.info('compressing the output file')
            with open(vcf_file, 'rb') as IN:
                with gzip.open(dest_path, 'wb', compresslevel=6) as OUT:
                    shutil.copyfileobj(IN, OUT, length=512 * (2**20))
            self.log.info('done compressing')

    def _phase_vcf(self, vcf_file):
        '''
            Phase a VCF file using beagle

            Parameters
            ----------
            vcf_file : str, path-like
                A path to a VCF file

            Returns
            -------
            A handle to a temporary file containing phased data
        '''
        # Get a temp file for the output
        phased_vcf = self._tmpfile()
        # Get the path of BEAGLE
        beagle_path = pkg_resources.resource_filename(
            'hapdab', 'include/beagle/beagle.28Sep18.793.jar')
        # Create a command
        cmd = (f"java -jar {beagle_path} gt={vcf_file} out={phased_vcf.name}"
               ).split()
        phaser = subprocess.run(cmd)
        if phaser.returncode != 0:
            raise ValueError("Phasing failed!")
        return phased_vcf

    def _conform_vcf(self, vcf_file):
        '''
            Conforms and sorts an input vcf based on the fasta

            Parameters
            ----------
            vcf_file : path
                The path the VCF file

            Returns
            -------
            A named temp file containing the sorted vcf
        '''
        headers = list()
        variants = list()
        cur_byte = 0
        chroms = list()
        temps = dict()

        self.log.info(f"Sorting {vcf_file}")
        # Get the chromosome order
        # Iterate through the chromosome keys and open temp files
        try:
            for chrom in self._fasta.chrom_names():
                temps[chrom] = self._tmpfile(suffix=f'sorted_vcf_{chrom}')
                chroms.append(chrom)
        except Exception as e:
            self.log.info(
                f'{e}: try increasing the open file limit on your system')
        # Get headers and extract positions with file byte offsets
        self.log.info(f"Reading in VCF: {vcf_file}")
        with RawFile(vcf_file) as VCF:
            for i, line in enumerate(VCF):
                if line.startswith('#CHROM'):
                    fields = line.strip().split()
                    for i in range(9, len(fields)):
                        fields[i] = fields[i] + '_ref'
                    headers.append('\t'.join(fields))
                elif line.startswith("#"):
                    headers.append(line.strip())
                else:
                    var = Variant.from_str(line)
                    #var.conform(self._fasta[var.chrom][var.pos].upper())
                    temps[var.chrom].write(str(var) + '\n')
        # close all temp files
        for key, val in temps.items():
            self.log.info(f"flushing tmp file: {key}")
            val.flush()
        self.log.info("Outputting sorted chroms")
        out = self._tmpfile(suffix='_sorted_vcf')
        # print headers
        print("\n".join(headers), file=out)
        for chrom in chroms:
            # read in that chroms bullshit
            with open(temps[chrom].name, 'r') as CHROM:
                variants = CHROM.readlines()
                # sort by position
                variants.sort(key=lambda x: int(x.split()[1]))
                self.log.info(f"printing chrom {chrom}")
                print("".join(variants), file=out, end="")
        return out