Exemple #1
0
    def group_inversions(cls, adjs):
        """Collapse neighbouring inversion adjacencies into 'INV' variants.

        Adjacencies are ordered by (first chromosome, first break).  Two
        consecutive adjacencies on the same first chromosome, whose first
        breaks are at most 25 apart and whose orientations are ('L', 'L')
        and ('R', 'R') in either order, become one two-adjacency variant
        with the ('L', 'L') adjacency listed first.  Every other adjacency
        becomes a single-adjacency variant unless it is flagged ``dubious``.

        Returns the list of created variants.
        """
        max_homology = 25
        left_left, right_right = ('L', 'L'), ('R', 'R')

        ordered = sorted(adjs, key=lambda adj: (adj.chroms[0], adj.breaks[0]))
        total = len(ordered)
        variants = []
        pos = 0
        while pos < total:
            current = ordered[pos]
            # The last adjacency has no successor and can only be a singleton.
            following = ordered[pos + 1] if pos + 1 < total else None

            is_pair = (
                following is not None
                and current.chroms[0] == following.chroms[0]
                and following.breaks[0] - current.breaks[0] <= max_homology
                and ((current.orients == left_left
                      and following.orients == right_right)
                     or (current.orients == right_right
                         and following.orients == left_left))
            )
            if is_pair:
                if current.orients == left_left:
                    members = [current, following]
                else:
                    members = [following, current]
                variants.append(Variant('INV', members))
                pos += 2
                continue

            if not current.dubious:
                variants.append(Variant('INV', [current]))
            pos += 1

        return variants
Exemple #2
0
def test_pc_iter_2():
    """Two adjacent substitutions must yield exactly two pseudocontigs.

    The same sequence and variants are iterated twice, with 4 and then 5 as
    the third PseudocontigIterator argument.
    """
    seq = 'AAAAAAAAAA'
    #      0123456789
    #          CG

    variants = [
        Variant('t', 't', position, 'A', alt, 0.25)
        for position, alt in ((4, 'C'), (5, 'G'))
    ]

    cases = (
        (4, ('AAACAAA', 'AACGAA')),
        (5, ('AAAACAAAA', 'AAACGAAA')),
    )
    for size, expected in cases:
        it = PseudocontigIterator(seq, variants, size)
        pcs = []
        # next() signals exhaustion with a falsy value
        while True:
            pc = it.next()
            if not pc:
                break
            pcs.append(pc)
        assert 2 == len(pcs)
        for contig in expected:
            assert contig in pcs
Exemple #3
0
    def process_vcf(self, cols):
        """Build this object from the VCF file at ``self.name``.

        Reads the leading ``##`` meta-information lines (collecting INFO and
        FORMAT definitions and the ``##source=`` caller), then the first
        non-``##`` line as the column header, then every remaining line as a
        variant record.

        Args:
            cols: requested column names; forwarded to ``extract_cols`` to
                pick the INFO/FORMAT fields that are kept.

        Returns:
            ``self``, with ``caller``, ``meta_info``, ``header`` and
            ``variants`` populated.

        Raises:
            SystemExit: if no ``##source=`` line identified the caller.
        """
        info_dict, format_dict = {}, {}

        # Header and records are read from the same handle in two passes, so
        # all file access stays inside the ``with`` to guarantee it is closed.
        with open(self.name, 'r') as vcf:
            # Read the meta-information lines from the vcf
            for i, line in enumerate(vcf):
                # The AF FORMAT definition is skipped: AF is calculated
                # regardless of what the file declares.
                if line.startswith('##FORMAT=<ID=AF'):
                    pass
                # Select the INFO/FORMAT lines
                elif line.startswith('##FORMAT'):
                    vcf_header = VcfHeader(line)
                    format_dict.update({vcf_header.meta_id: vcf_header})
                elif line.startswith('##INFO'):
                    vcf_header = VcfHeader(line)
                    info_dict.update({vcf_header.meta_id: vcf_header})
                # Keep other meta-info lines
                elif line.startswith('##'):
                    if line.startswith('##source='):
                        self.caller = line.replace('##source=', '').strip()
                    self.meta_info.append(line)
                else:
                    # First non-meta line: the column header
                    break
                # Only extract the (filtered) DP in the format
                # NOTE(review): this drops INFO/DP as soon as *any* FORMAT
                # field exists, not specifically FORMAT/DP -- confirm intent.
                if "DP" in info_dict and format_dict:
                    info_dict.pop("DP", None)

            if not self.caller:
                sys.exit("Cannot identify caller from file {}\n"
                         "Please add caller identify line "
                         "'##source=(caller name)' to vcf header"
                         .format(self.name))

            # When the user asks for AF and the vcf does not declare it, add
            # the definition so it can be calculated for them
            if ('AF' in cols) and ('AF' not in info_dict):
                vcf_header = VcfHeader(AF_LINE)
                info_dict.update({vcf_header.meta_id: vcf_header})

            # Select the columns from INFO/FORMAT
            info_cols, format_cols = extract_cols(info_dict, format_dict, cols)

            # Add the INFO line (with caller) / FORMAT (unchanged) to header_list
            self.meta_info += [
                VcfHeader.write(VcfHeader.add_caller(v, self.caller))
                for v in info_cols.values()
            ]
            self.meta_info += [VcfHeader.write(v) for v in format_cols.values()]

            self.header = line

            # Continue to read the file, this time the variants
            for j, line in enumerate(vcf):
                variant = Variant().process_variant(line, caller=self.caller)
                if variant.alt == '*':
                    # i is the 0-based index of the header line and j the
                    # 0-based index of this record after it, so the 1-based
                    # file line number is i + j + 2.
                    print("Warning: Vcf {} line {} has variant with alt=*"
                          .format(self.caller, i + j + 2))
                cleaned_variant = Variant.select_info(variant, info_cols,
                                                      format_cols)
                # The dictionary is queried by chr\tpos\tref\talt
                self.variants.update(
                    {cleaned_variant.variant_key: cleaned_variant})

        return self
def getVariants():
    """Return initialized variants for testing, as lists keyed "1", "2", "X"."""
    pid = "DCIS_1"
    rows = ["", ""]
    # Every Variant shares the same ``rows`` list object.
    return {
        "1": [
            Variant(pid, "1", "100.0", "200.0", rows),
            Variant(pid, "1", "1025", "1119", rows),
        ],
        "2": [Variant(pid, "2", "25006", "25124", rows)],
        "X": [Variant(pid, "X", "90045", "90157.5", rows)],
    }
Exemple #5
0
def main():
    """Rewrite the VCF at ``options.input_vcf`` as '|'-separated text.

    Reads the module-level ``options`` (``input_vcf``, ``output_vcf``,
    ``non_model``, ``ploidy``).  A header produced by
    ``Variant.create_psv_header`` is written first, followed by one
    ``put_to_psv`` entry per record whose FILTER is empty.
    """
    global options, args
    separator = '|'

    # Open and parse each line of the vcf file; the ``with`` closes the input
    # handle, which the reader would otherwise keep open.
    with open(options.input_vcf, 'r') as vcf_handle:
        input_vcf = vcf.Reader(vcf_handle)
        if options.non_model:
            variant = Variant(samples=input_vcf.samples,
                              organism_type='non_model',
                              ploidy=options.ploidy)
        else:
            variant = Variant(samples=input_vcf.samples,
                              ploidy=options.ploidy)

        # Open output file
        with open(options.output_vcf, 'w') as output_psv:
            # Generate output file header
            output_psv.write(variant.create_psv_header(separator=separator))

            # Now parse lines in .vcf and output with new format:
            for record in input_vcf:
                # Only output sites that haven't been filtered out.  PyVCF
                # reports FILTER as None for "." and [] for PASS, so test
                # truthiness instead of calling len() on a possible None.
                if not record.FILTER:
                    variant.get_from_record(record=record)
                    output_psv.write(variant.put_to_psv(separator=separator))
Exemple #6
0
    def group_trls(cls, adjs):
        """Group 2 translocation adjacencies into single reciprocal event.

        Non-dubious adjacencies are ordered by (first chromosome, first
        break).  Two consecutive adjacencies are paired into one 'TRL'
        variant when they share both chromosomes, both of their breaks lie
        within 10000 of each other, and their orientations are opposite
        (L/R with R/L, or L/L with R/R, in either order).

        Args:
            adjs: iterable of adjacency objects exposing ``dubious``,
                ``chroms``, ``breaks``, ``orients`` and ``id``.

        Returns:
            ``(variants, trls_remained)``: the paired 'TRL' variants, and the
            non-dubious adjacencies that were not placed in any pair.
        """
        neighborhood = 10000
        reciprocal_orients = (
            (('L', 'R'), ('R', 'L')),
            (('R', 'L'), ('L', 'R')),
            (('L', 'L'), ('R', 'R')),
            (('R', 'R'), ('L', 'L')),
        )

        trls = sorted([adj for adj in adjs if not adj.dubious],
                      key=lambda adj: (adj.chroms[0], adj.breaks[0]))

        grouped_trl_ids = set()
        variants = []
        i = 0
        while i < len(trls) - 1:
            first, second = trls[i], trls[i + 1]
            if (first.chroms[0] == second.chroms[0]
                    and first.chroms[1] == second.chroms[1]
                    and abs(second.breaks[0] - first.breaks[0]) <= neighborhood
                    and abs(second.breaks[1] - first.breaks[1]) <= neighborhood
                    and (first.orients, second.orients) in reciprocal_orients):
                variants.append(Variant('TRL', [first, second]))
                grouped_trl_ids.add(first.id)
                grouped_trl_ids.add(second.id)
                # Both members are consumed; skip past the partner.
                i += 2
            else:
                i += 1

        # The collected ids must survive up to this point: clearing them
        # would hand every paired adjacency back as "remaining" as well.
        trls_remained = [trl for trl in trls if trl.id not in grouped_trl_ids]

        return variants, trls_remained
Exemple #7
0
def main():
    """Convert a pipeline XML description into an executable shell script.

    Command line: ``convert <pipeline>.xml <output>.sh (arg_name=arg_value)*``.
    With fewer than two positional arguments the usage text is printed and
    nothing else happens.  Otherwise the package header is written to
    ``Test/diff_expr/include.sh``, the generated script is written to the
    output path, made executable, and echoed to stdout.

    Raises:
        ValueError: if an extra argument contains no ``=``.
    """
    if len(sys.argv) < 3:
        print(
            'usage:\nconvert <pipeline>.xml <output>.sh (arg_name=arg_value)*')
        return

    pl_file = sys.argv[1]
    script = sys.argv[2]

    args = {}
    for item in sys.argv[3:]:
        # Split on the first '=' only, so values may themselves contain '='.
        arg, value = item.split('=', 1)
        args[arg] = Variant.from_string(value, 'string')

    pm = PackageManager()
    # NOTE(review): the header goes to a fixed path while the script sources
    # "$DIR/include.sh" next to itself -- confirm this is intended.
    with open('Test/diff_expr/include.sh', 'w') as include_sh:
        include_sh.write(pm.get_header())

    pipeline = Pipeline(pl_file, pm)
    # Generate before opening the output so that a failure here does not
    # leave behind an empty, truncated script.
    output = pipeline.generate(args)
    with open(script, 'w') as out_file:
        out_file.write('DIR="${BASH_SOURCE%/*}"\n. "$DIR/include.sh"\n\n')
        out_file.write(output)
    system('chmod +x ' + script)
    print(output)
Exemple #8
0
    def _get_explicit_value(self, node): 
        '''Return Variant with step's arg value'''

        if 'val' in node.attrib:
            return Variant.from_string(node.attrib['val'], node.attrib.get('type'))
        elif 'ref' in node.attrib:
            parts = node.attrib['ref'].split('.')
            if len(parts) == 1: #local variable
                option_name = parts[0]
                if option_name in self._inputs:
                    return self._inputs[option_name].get()
                if option_name in self._outputs:
                    return self._outputs[option_name].get()
                else:
                    raise RuntimeError('Reference to undefined option: ' + option_name)
            elif len(parts) == 2: #some pipeline output
                step_name, output_name = parts
                if step_name not in self._step_pipelines:
                    raise RuntimeError('Reference to undefined step: ' + step_name)
                output = self._step_pipelines[step_name]._get_output(output_name)
                
                if output is None:
                    raise RuntimeError('Undefined step output %s.%s has:%s' % (parts[0], parts[1], self._step_pipelines[parts[0]]._outputs))
                
                return output.get()
            else:
               raise RuntimeError('Wrong reference format')

        return None
Exemple #9
0
    def parse_MAF(self):
        ''' Parse the current MAF row (self.row) into a Variant object.

        Column positions come from self.fieldId.  The column names are
        matched case-sensitively (e.g. 'Start_position', not
        'Start_Position').  An allele given as "-" is stored as the empty
        string.
        '''
        record, columns = self.row, self.fieldId
        header = self.header  # read as in the other parsers; not used below
        source_path = self.fn

        def cell(name):
            return record[columns[name]]

        def whole_part(name):
            # Keep only the text before the first '.', e.g. "123.0" -> 123
            return int(str(cell(name)).split('.')[0])

        position = whole_part('Start_position')
        dp = whole_part('TTotCov')
        vf = float(cell('TVarCov')) / float(dp)
        chrom = str(cell('Chromosome'))
        ref = str(cell('Reference_Allele'))
        alt = str(cell('Tumor_Seq_Allele2'))
        effect, fc = self.eff, self.fc

        ref = "" if ref == "-" else ref
        alt = "" if alt == "-" else alt

        return Variant(source=source_path.split('/')[-1],
                       pos=HTSeq.GenomicPosition(chrom, int(position)),
                       ref=ref,
                       alt=alt,
                       frac=vf,
                       dp=dp,
                       eff=effect.strip(';'),
                       fc=fc.strip(';'))
Exemple #10
0
    def parse_MuTectOUT(self):
        ''' Parse the current MuTect '.out' row (self.row) into a Variant object.

        Chromosome, ref and alt are taken from fixed columns 0, 3 and 4; the
        remaining values are located through self.fieldId.  Depth is the sum
        of the 't_ref_count' and 't_alt_count' columns.
        '''
        record, columns = self.row, self.fieldId
        header = self.header  # read as in the other parsers; not used below
        source_path = self.fn
        chrom, ref, alt = record[0], record[3], record[4]
        effect, fc = self.eff, self.fc

        vf = float(record[columns['tumor_f']])
        counts = [
            int(str(record[columns[name]]).strip())
            for name in ('t_ref_count', 't_alt_count')
        ]
        dp = sum(counts)
        position = int(record[columns['position']])

        return Variant(source=source_path.split('/')[-1],
                       pos=HTSeq.GenomicPosition(chrom, int(position)),
                       ref=ref,
                       alt=alt,
                       frac=vf,
                       dp=dp,
                       eff=effect.strip(';'),
                       fc=fc.strip(';'))
Exemple #11
0
    def get_product_skus(self, product):
        # Fetch the product's JSON description and turn every entry of its
        # 'options' mapping into a Variant(size, SKU, stockStatus).
        # Returns the list of Variant objects; on a missing key the error is
        # logged and the process exits with status -1.
        logt(self.tid, 'fetching product variants')
        variants = []
        try:
            url = "https://commerce.mesh.mx/stores/{}/products/{}".format(self.sitename, product)
            params = {
                "expand": "variations,informationBlocks,customisations",
                "channel": "iphone-app"
            }
            response = requests.request('GET', url, headers=self.headers, params=params)
            payload = response.json()

            options = payload['options']
            for size in options:
                entry = options[size]
                sku = entry['SKU']
                stock_status = entry['stockStatus']
                logt(self.tid, "[size] {}  \t sku {} \t {}".format(size, sku, stock_status))
                variants.append(Variant(size, sku, stock_status))
            return variants
        except KeyError:
            logt(self.tid, "[error] exception while getting product info json")
            exit(-1)
Exemple #12
0
    def parseVCF(self, file):
        """Parse the VCF file at ``file`` and store one Variant per data line.

        Lines starting with "#" are skipped.  Every other line is split on
        tabs and its first ten columns are handed to ``Variant``; the
        position is converted with ``int`` and the quality with ``float``.
        The created variants are appended to ``self.__variants``.

        Raises:
            IndexError: if a data line has fewer than ten tab-separated
                columns.
            ValueError: if the position or quality column is not numeric.
        """
        # ``with`` closes the handle even when a line fails to parse
        with open(file, "r") as fileReader:
            for line in fileReader:
                # header / meta lines carry no variant
                if line.startswith("#"):
                    continue

                # NOTE(review): the line is not stripped, so a line ending
                # right after the tenth column keeps its newline in
                # noneGroup -- confirm whether Variant relies on that.
                lineTokens = line.split("\t")

                # name the columns so it is clear what is being used
                chromosome = lineTokens[0]
                position = int(lineTokens[1])
                variantId = lineTokens[2]
                referenceAllele = lineTokens[3]
                alternateAllele = lineTokens[4]
                qualityScore = float(lineTokens[5])
                filterFlag = lineTokens[6]
                infoGroup = lineTokens[7]
                formatGroup = lineTokens[8]
                noneGroup = lineTokens[9]

                # create the variant and add it
                variant = Variant(chromosome, position, variantId,
                                  referenceAllele, alternateAllele,
                                  qualityScore, filterFlag, infoGroup,
                                  formatGroup, noneGroup)
                self.__variants.append(variant)
Exemple #13
0
    def parse_SamTools(self):
        ''' Parse the current samtools vcf row (self.row) into a Variant object.

        The first "DP4=" entry of the INFO column supplies four counts: the
        first two are summed into ``ro`` and the last two into ``ao``
        (presumably reference and alternate observations).  Depth is
        ro + ao and the variant fraction is ao / depth.  When INFO has no
        "DP4=" entry the function returns None.
        '''
        record, columns = self.row, self.fieldId
        header = self.header  # read as in the other parsers; not used below
        source_path = self.fn
        chrom, ref, alt = record[0], record[3], record[4]
        effect, fc = self.eff, self.fc
        position = int(record[columns['POS']])

        for info_field in record[columns['INFO']].split(';'):
            if not info_field.startswith("DP4="):
                continue
            depths = info_field.split('=')[1].split(',')
            ro = int(depths[0]) + int(depths[1])
            ao = int(depths[2]) + int(depths[3])
            dp = ro + ao
            vf = float(ao) / float(dp)
            return Variant(source=source_path.split('/')[-1],
                           pos=HTSeq.GenomicPosition(chrom, int(position)),
                           ref=ref,
                           alt=alt,
                           frac=vf,
                           dp=dp,
                           eff=effect.strip(';'),
                           fc=fc.strip(';'))

        return None
Exemple #14
0
    def parse_SomaticIndelDetector(self):
        ''' GATK SomaticIndelDetector vcf parser function.

        Parses the current row (self.row) into a Variant object.  The sample
        column is taken to be the last entry of self.header; its AD and DP
        sub-fields are located through the FORMAT column.  The variant
        fraction is the second AD value divided by DP.

        Raises ValueError when FORMAT lacks AD or DP.
        '''

        row = self.row
        fieldId = self.fieldId
        header = self.header
        fn = self.fn
        chrom = row[0]
        ref = row[3]
        alt = row[4]
        effect = self.eff
        fc = self.fc
        # Below attempts to grab sample ID.
        # assumes that sample ID is the final column in the self.header. always true?
        # if not always true, adopt the parse_mutect solution here as well
        tmpsampID = header[-1]

        sample_fields = row[fieldId[tmpsampID]].split(':')
        ALT_count = None
        dp = None
        for j, key in enumerate(row[fieldId['FORMAT']].split(':')):
            if key == "AD":
                ALT_count = sample_fields[j].split(',')[1]
            elif key == "DP":
                # NOTE(review): dp stays a string here, while the other
                # parsers pass an int -- confirm what Variant expects.
                dp = sample_fields[j]

        if ALT_count is None or dp is None:
            raise ValueError(
                "FORMAT column for sample {0} lacks AD and/or DP".format(
                    tmpsampID))
        # Computed after the loop so the result does not depend on AD being
        # listed before DP in FORMAT.
        vf = float(ALT_count) / float(dp)

        position = int(row[fieldId['POS']])
        var = Variant(source=fn.split('/')[-1],
                      pos=HTSeq.GenomicPosition(chrom, int(position)),
                      ref=ref,
                      alt=alt,
                      frac=vf,
                      dp=dp,
                      eff=effect.strip(';'),
                      fc=fc.strip(';'))
        return var
Exemple #15
0
    def parse_VarScan(self):
        ''' Parse the current varscan vcf row (self.row) into a Variant object.

        The sample column is the one named by the last entry of self.header.
        Its DP sub-field gives the depth and its FREQ sub-field (a percentage
        with a trailing '%') gives the variant fraction after division
        by 100.
        '''
        record, columns = self.row, self.fieldId
        header = self.header
        source_path = self.fn
        chrom, ref, alt = record[0], record[3], record[4]
        effect, fc = self.eff, self.fc
        position = int(record[columns['POS']])

        def sample_value(index):
            # Sub-field of the sample column at the given FORMAT position
            return record[columns[header[-1]]].split(':')[index]

        for index, key in enumerate(record[columns['FORMAT']].split(':')):
            key = str(key)
            if key == "DP":
                dp = int(sample_value(index))
            if key == "FREQ":
                vf = float(str(sample_value(index)).strip('%')) / float(100)

        return Variant(source=source_path.split('/')[-1],
                       pos=HTSeq.GenomicPosition(chrom, int(position)),
                       ref=ref,
                       alt=alt,
                       frac=vf,
                       dp=dp,
                       eff=effect.strip(';'),
                       fc=fc.strip(';'))
Exemple #16
0
    def parse_HapCaller(self):
        ''' GATK haplotype caller vcf parser function.

        Parses the current row (self.row) into a Variant object.  The sample
        column is the one named by the last entry of self.header.  Depth
        comes from its DP sub-field; the variant fraction is
        alt / (ref + alt) using the first two values of its AD sub-field.
        When either value cannot be determined the row is echoed to stderr
        and 0.0 is used instead.
        '''

        row = self.row
        fieldId = self.fieldId
        header = self.header
        fn = self.fn
        chrom = row[0]
        ref = row[3]
        alt = row[4]
        effect = self.eff
        fc = self.fc
        position = int(row[fieldId['POS']])

        # None marks "not found in FORMAT", replacing the previous
        # try/except probes for unbound names.
        vf = None
        dp = None
        for j, key in enumerate(row[fieldId['FORMAT']].split(':')):
            if str(key) == "DP":
                dp = int(row[fieldId[header[-1]]].split(':')[j])
            if str(key) == "AD":
                ad = str(row[fieldId[header[-1]]].split(':')[j])
                if ',' in ad:
                    ref_count = int(ad.split(',')[0])
                    alt_count = int(ad.split(',')[1])
                    try:
                        vf = float(alt_count) / (float(ref_count) +
                                                 float(alt_count))
                    except ZeroDivisionError:
                        # no reads support either allele
                        vf = 0.0
                else:
                    abortWithMessage(
                        "Sample {0} may not have Haplotype Caller mutations with no ALT or vf"
                        .format(header[-1]))

        if vf is None:
            print(row, file=sys.stderr)
            vf = 0.0
        if dp is None:
            print(row, file=sys.stderr)
            # kept as a float to match the value previously used here
            dp = 0.0

        var = Variant(source=fn.split('/')[-1],
                      pos=HTSeq.GenomicPosition(chrom, int(position)),
                      ref=ref,
                      alt=alt,
                      frac=vf,
                      dp=dp,
                      eff=effect.strip(';'),
                      fc=fc.strip(';'))
        return var
Exemple #17
0
    def parse_MiSeq(self):
        ''' MiSeq vcf parser function.

        Parses the current row (self.row) into a Variant object, using
        self.fieldId for column positions, the last entry of self.header as
        the sample column, and self.fc / self.eff as the starting functional
        consequence and effect strings.
        '''

        row = self.row
        fieldId = self.fieldId
        header = self.header
        fn = self.fn
        chrom = row[0]
        ref = row[3]
        alt = row[4]
        fc = self.fc
        effect = self.eff
        for i in row[fieldId['INFO']].split(';'):
            if i.startswith("DP="):
                # kept as the raw text after '='; not converted to a number here
                dp = i.split('=')[1]

            # if the MiSeq software reported functional consequence and effect and the file is not snpEff annotated, the MiSeq annotations will be used instead
            # `not fc` is re-checked for every INFO entry, so once one FC= entry has filled fc, later FC= entries are skipped
            if i.startswith("FC=") and not fc:
                for j in i.split('=')[1].split(','):
                    # each item is split on '_': part 0 is appended to fc, part 1 to effect
                    # (`not in` is a substring test against the text accumulated so far)
                    if str(j.split('_')[0]) not in str(fc):
                        fc += str(j.split('_')[0]) + ";"
                    try:
                        if str(j.split('_')[1]) not in str(effect):
                            effect += str(j.split('_')[1]) + ";"
                    # NOTE(review): bare except swallows every error here, e.g. the IndexError raised when an item has no '_'
                    except:
                        pass
            elif str(i) == "EXON":
                fc += 'EXON'
        # fall back to "?" when nothing was collected
        if not fc:
            fc = str("?")
        if not effect:
            effect = str("?")
        k = 0
        for i in row[fieldId['FORMAT']].split(':'):
            if str(i) == "VF":
                # value at the same position in the sample column (named by the last header entry)
                vf = float(row[fieldId[header[-1]]].split(':')[k])
            # the triple-quoted text below is an inert string expression holding disabled code; it does not run
            '''
            #for when vf is not in the format column, but AD is
            if str(i) == "AD" and not dp or not vf:
                dp = 0
                rd = int(row[fieldId[header[-1]]].split(':')[k].split(',')[0])
                ad = int(row[fieldId[header[-1]]].split(':')[k].split(',')[1])
                dp = int(rd) + int(ad)
            '''
            k += 1

        position = int(row[fieldId['POS']])
        # NOTE(review): vf is only bound when FORMAT contains VF, and dp only when INFO contains DP=; otherwise this call fails on an unbound local
        var = Variant(source=fn.split('/')[-1],
                      pos=HTSeq.GenomicPosition(chrom, int(position)),
                      ref=ref,
                      alt=alt,
                      frac=vf,
                      dp=dp,
                      eff=effect.strip(';'),
                      fc=fc.strip(';'))
        return var
Exemple #18
0
def main():
    # Script entry point (Python 2 syntax: uses print statements).
    # Reads the module-level `options` for the input VCF path, builds a
    # Variant from the reader's samples and, for the first record, calls
    # load_consequence on every entry of variant.consequences.
    global options, args
    # **********************
    # store in DBNLVar
    # **********************

    # Define connection parameters and perform connection:
    # NOTE(review): `connection` is not used again in this function.
    connection = httplib2.Http(".cache")
    connection.add_credentials('*****@*****.**',
                               'prueba')

    # Open annotation file and parse each line in it
    annotation_vcf = vcf.Reader(open(options.input_vcf, 'r'))

    # Load metadata in variant object
    variant = Variant(samples=annotation_vcf.samples)

    for record in annotation_vcf:
        # Load variant information in DBNLVar, from consequences
        variant.get_from_record(record=record)
        for consequence in variant.consequences:
            resp = load_consequence(consequence=consequence)

        # NOTE(review): quit() ends the program during the first record, so
        # nothing below it in this loop body is ever executed.
        quit()
        # Store consequence non-relating data in DBNLVar
        if not check_record(table='chromosome',
                            value=chrom_to_number(record.CHROM)):
            #payload = {'id': chrom_to_number(record.CHROM), 'name': number_to_chrom(chrom_to_number(record.CHROM))}
            load_record(payload=record)

        # Store consequence relating data in DBNLVar
        # NOTE(review): `separator`, `uri` and `auth` are not defined in this
        # function; presumably module-level names -- verify before enabling.
        for consequence in record.INFO['CSQ']:
            for index, annotation in enumerate(consequence.split(separator)):
                payload = {}

        resp = requests.get(uri + 'chromosome/id/3/24.json', auth=auth)
        print resp.json()
        if resp.status_code == 200:
            content = resp.json()['content']
        else:
            print "ERROR: Problem in query"
            # NOTE(review): bare `raise` outside an except block has no
            # active exception to re-raise.
            raise
        print content
Exemple #19
0
def test_pc_iter_3():
    """Three adjacent substitutions: the expected pseudocontigs are produced."""
    seq = 'AAAAAAAAAAA'
    #      01234567890
    #          CGT

    variants = [
        Variant('t', 't', position, 'A', alt, 0.25)
        for position, alt in ((4, 'C'), (5, 'G'), (6, 'T'))
    ]
    it = PseudocontigIterator(seq, variants, 4)
    pcs = []
    # next() signals exhaustion with a falsy value
    while True:
        pc = it.next()
        if not pc:
            break
        pcs.append(pc)
    for expected in ('AAACAAA', 'AACGAA', 'ACATA', 'ACGTA'):
        assert expected in pcs
Exemple #20
0
def main():
    # Script entry point (Python 2 syntax: uses print statements).
    # Reads the module-level `options` for the input VCF path, builds a
    # Variant from the reader's samples and, for the first record, calls
    # load_consequence on every entry of variant.consequences.
    global options, args
    # **********************
    # store in DBNLVar
    # **********************

    # Define connection parameters and perform connection:
    # NOTE(review): `connection` is not used again in this function.
    connection = httplib2.Http(".cache")
    connection.add_credentials("*****@*****.**", "prueba")

    # Open annotation file and parse each line in it
    annotation_vcf = vcf.Reader(open(options.input_vcf, "r"))

    # Load metadata in variant object
    variant = Variant(samples=annotation_vcf.samples)

    for record in annotation_vcf:
        # Load variant information in DBNLVar, from consequences
        variant.get_from_record(record=record)
        for consequence in variant.consequences:
            resp = load_consequence(consequence=consequence)

        # NOTE(review): quit() ends the program during the first record, so
        # nothing below it in this loop body is ever executed.
        quit()
        # Store consequence non-relating data in DBNLVar
        if not check_record(table="chromosome", value=chrom_to_number(record.CHROM)):
            # payload = {'id': chrom_to_number(record.CHROM), 'name': number_to_chrom(chrom_to_number(record.CHROM))}
            load_record(payload=record)

        # Store consequence relating data in DBNLVar
        # NOTE(review): `separator`, `uri` and `auth` are not defined in this
        # function; presumably module-level names -- verify before enabling.
        for consequence in record.INFO["CSQ"]:
            for index, annotation in enumerate(consequence.split(separator)):
                payload = {}

        resp = requests.get(uri + "chromosome/id/3/24.json", auth=auth)
        print resp.json()
        if resp.status_code == 200:
            content = resp.json()["content"]
        else:
            print "ERROR: Problem in query"
            # NOTE(review): bare `raise` outside an except block has no
            # active exception to re-raise.
            raise
        print content
Exemple #21
0
def variant_from_index_list(idx_list, line):
    """Build a Variant from one input line using a list of column indexes.

    ``idx_list[0..6]`` give the positions in ``line`` of, in order: chrom,
    start, end, ref, alt, gene and variant type.  Spaces in the variant type
    are replaced by underscores and the result is stripped.
    """
    chrom, start, end, ref, alt, gene = (
        line[idx_list[slot]] for slot in range(6)
    )
    var_type = line[idx_list[6]].replace(" ", "_").strip()
    return Variant(chrom, start, end, ref, alt, gene, var_type, None)
 def __setVariant__(self, row):
     # Stores one row of the variant file as a Variant in self.variants,
     # nested by patient id and then by chromosome.
     columns = self.vhead
     pid = row[columns["Patient"]]
     c = self.__setChromosome__(row[columns["Chr"]])
     start, end, name = (row[columns[key]] for key in ("Start", "End", "Name"))
     # Create the per-patient dict and per-chromosome list on first use
     by_chromosome = self.variants.setdefault(pid, {})
     by_chromosome.setdefault(c, []).append(
         Variant(pid, c, start, end, row, name))
Exemple #23
0
def main():
    # Script entry point (Python 2 syntax: uses print statements).
    # As written, it formats HGVS names for two hard-coded variants of
    # transcript ENST00000359218.5, prints them, and then calls quit().
    global options, args
    separator = '|'

    # Parse the HGVS name into genomic coordinates and alleles.
    #chrom, offset, ref, alt = hgvs.parse_hgvs_name('ENST00000515609.1:c.30G>T', genome, get_transcript=get_transcript)
    #print chrom, offset, ref, alt

    # Format an HGVS name.
    # NOTE(review): `genome` and `get_transcript` are not defined in this
    # function; presumably module-level names -- verify.
    chrom, offset, ref, alt = ('chr2', 179616770, 'GAA', 'G')
    transcript = get_transcript('ENST00000359218.5')
    hgvs_name = hgvs.format_hgvs_name(chrom, offset, ref, alt, genome, transcript)
    print hgvs_name
    chrom, offset, ref, alt = ('chr2', 179616770, 'GAA', 'GA')
    transcript = get_transcript('ENST00000359218.5')
    hgvs_name = hgvs.format_hgvs_name(chrom, offset, ref, alt, genome, transcript)
    # hgvs_var and hgvs_var2 are built but not used afterwards
    hgvs_var = hgvs.HGVSName(hgvs_name)
    hgvs_str = 'ENST00000359218.5:c.10597+1079_10597+1080delTTinsT'
    hgvs_var2 = hgvs.HGVSName(hgvs_str)

    print hgvs_name
    # NOTE(review): quit() ends the program here, so the VCF conversion
    # below is never reached.
    quit()

    # Open and parse each line of the vcf file
    input_vcf = vcf.Reader(open(options.input_vcf, 'r'))
    variant = Variant(samples=input_vcf.samples)

    # Open output file
    with open(options.output_vcf, 'w') as output_psv:
        # Generate output file header
        #variant = ConsequenceType(input_vcf.samples)
        output_psv.write(variant.create_psv_header(separator=separator))

        # Now parse lines in .vcf and output with new format:
        for record in input_vcf:
            # Only output sites that haven't been filtered out
            if len(record.FILTER) == 0:
                #for consequence in range(0, len(record.INFO['CSQ'])):
                variant.get_from_record(record=record)
                output_psv.write(variant.put_to_psv(separator=separator))
Exemple #24
0
    def create_variant(self, value, scope, datatype=None):
        """Creates a `Variant` of this topic name with the specified
        string `value` and `scope`.

        If `datatype` is None, the newly created `Variant` will have
        the datatype xsd:anyURI when `value` is a `Locator`, and
        xsd:string otherwise.

        The newly created `Variant` will contain all themes from the
        parent name and the themes specified in `scope`.

        :param value: the string value or locator which represents an IRI
        :type value: string or `Locator`
        :param scope: a single theme, or a list/tuple of themes
        :type scope: `Topic` or list of `Topic`s
        :param datatype: the datatype of the value, or None for the default
        :type datatype: `Locator` or None
        :rtype: `Variant`
        :raises ModelConstraintException: if `value` is None, if `scope`
            is empty, or if `scope` equals the scope of the parent name

        """
        if value is None:
            raise ModelConstraintException(self, 'The value may not be None')
        if not scope:
            raise ModelConstraintException(self, 'The scope may not be None')
        # Normalise to a list so that a single theme is accepted and so that
        # a tuple can actually compare equal to the parent's scope below.
        if isinstance(scope, (list, tuple)):
            scope = list(scope)
        else:
            scope = [scope]
        if scope == list(self.get_scope()):
            raise ModelConstraintException(
                self, 'The variant would be in the same scope as the parent')
        if datatype is None:
            # Default to xsd:string for every non-Locator value (as the
            # documentation promises) rather than leaving datatype as None.
            if isinstance(value, Locator):
                datatype = Locator(XSD_ANY_URI)
            else:
                datatype = Locator(XSD_STRING)
        if isinstance(value, Locator):
            value = value.to_external_form()
        variant = Variant(name=self, datatype=datatype.to_external_form(),
                          value=value, topic_map=self.topic_map)
        # Saved before themes are added to its scope
        variant.save()
        for theme in scope:
            variant.scope.add(theme)
        return variant
Exemple #25
0
def test_pc_iter_4():
    """A substitution next to an 'N' in the sequence yields no pseudocontig."""
    seq = 'AAANAAAAA'
    #      012345678
    #          T

    it = PseudocontigIterator(seq, [Variant('t', 't', 4, 'A', 'T', 0.25)], 4)
    pcs = []
    # next() signals exhaustion with a falsy value
    while True:
        pc = it.next()
        if not pc:
            break
        pcs.append(pc)
    assert 0 == len(pcs)
Exemple #26
0
def test_pc_iter_deletion_2():
    """A three-base deletion yields exactly one pseudocontig, 'AAAAAA'."""
    seq = 'AAAAAAAAA'
    #      012345678
    #         xxx

    deletion = Variant('t', 't', 3, 'AAA', [''], 0.25)
    it = PseudocontigIterator(seq, [deletion], 4)
    pcs = []
    # next() signals exhaustion with a falsy value
    while True:
        pc = it.next()
        if not pc:
            break
        pcs.append(pc)
    assert 1 == len(pcs)
    assert 'AAAAAA' == pcs[0]
Exemple #27
0
def find_nocov_variants(covlist,chrom='',caller='',min_cov=5):
    """Describe every run of positions whose coverage is below ``min_cov``.

    ``covlist[0]`` is a placeholder (the original comment calls it "the -1
    at index 0") and is excluded from the result.  Returns None when every
    remaining position is below ``min_cov``; otherwise returns one
    'no_cov' Variant per interval reported by ``intervals``, carrying the
    start position, length and mean coverage of the interval.
    """
    assert min(covlist[1:]) >= 0
    low_positions = [
        index for index, depth in enumerate(covlist) if depth < min_cov
    ]
    low_positions.remove(0) # take off the -1 at index 0
    if len(low_positions) == len(covlist) - 1:
        # entire sequence has no coverage
        return None

    def describe(interval):
        first, last = interval[0], interval[1]
        data = {'chrom':chrom,'caller':caller,'pos':first, 'type': 'no_cov'}
        data['length'] = last - first + 1
        data['mean_cov'] = scipy.mean(covlist[first:(last+1)])
        return data

    # Materialise all intervals first, then build the variants
    nocov_intervals = list(intervals(low_positions))
    return [Variant.from_dict(describe(iv)) for iv in nocov_intervals]
Exemple #28
0
def parse_variants(line):
    """Parse variant information from one annovar-annotated vcf line.

    Columns 0-4 supply chromosome, start, end, ref and alt; column 7 is
    split on ';' and handed to ``parse_info``, which supplies the genes,
    the variant type and the allele frequency.  One Variant is returned
    per gene.
    """
    chromosome, start, end = line[0], line[1], line[2]
    ref, alt = line[3], line[4]
    info_fields = line[7].split(';')
    genes, var_type, af = parse_info(info_fields)
    return [
        Variant(chromosome, start, end, ref, alt, gene, var_type, af, None)
        for gene in genes
    ]
Exemple #29
0
def test_pc_iter_insertion_2():
    """Inserting 'TT' at index 4 must yield exactly one pseudocontig, 'AATTAA'."""
    seq = 'AAAAAAAAA'
    #      012345678
    #          ^
    #          TT

    insertion = Variant('t', 't', 4, '', ['TT'], 0.25)
    it = PseudocontigIterator(seq, [insertion], 4)

    # Drain the iterator: next() signals exhaustion with a falsy value.
    pcs = []
    while True:
        pc = it.next()
        if not pc:
            break
        pcs.append(pc)

    assert 1 == len(pcs)
    assert 'AATTAA' == pcs[0]
Exemple #30
0
    def uniqueVariants(self):
        '''Return the set of unique variants from the set of all variants (for this feature).

        Variants are considered identical when they share the tuple
        (contig, pos, ref, alt).  For each unique tuple a single Variant is
        built whose source / frac / dp / eff / fc fields are the ", "-joined
        values of every matching input variant, in the order they appear in
        self.variants.
        '''
        # Group in a single pass instead of rescanning self.variants once per
        # unique key (the previous nested loop was quadratic).
        grouped = {}
        for var in self.variants:
            key = (var.pos.chrom, var.pos.pos, var.ref, var.alt)
            grouped.setdefault(key, []).append(var)

        # sort by chr, then position
        # TO DO: python sorted() will sort as: chr1, chr10, chr2, chr20, chrX. Fix.
        ordered_keys = sorted(grouped, key=lambda varx: (varx[0] + str(varx[1])))

        def joined(values):
            # Same result as appending "<value>, " repeatedly and then
            # stripping commas/spaces from both ends.
            return ", ".join(str(v) for v in values).strip(", ")

        uniqueVariants = set()
        for key in ordered_keys:
            members = grouped[key]
            chrom, position, ref, alt = key
            uniqueVar = Variant(
                # source values are joined as-is (they must already be strings)
                ", ".join(m.source for m in members).strip(", "),
                HTSeq.GenomicPosition(chrom, position),
                ref=ref,
                alt=alt,
                frac=joined(m.frac for m in members),
                dp=joined(m.dp for m in members),
                eff=joined(m.eff for m in members),
                fc=joined(m.fc for m in members))
            uniqueVariants.add(uniqueVar)

        return uniqueVariants
Exemple #31
0
    def parse_IonTorrent(self):
        ''' Ion Torrent vcf parser function. Input: InputParser object. Output: Variant object

        Reads AO (alt observations), RO (ref observations) and DP (depth) from
        the INFO column, computes the variant fraction as AO / (RO + AO) and
        returns a Variant located at the row's POS.  A comma-separated AO
        (several alt alleles on one line) is summed into a single count.
        '''
        row = self.row
        fieldId = self.fieldId
        fn = self.fn
        chrom = row[0]
        ref = row[3]
        alt = row[4]
        effect = self.eff
        fc = self.fc

        # Pre-initialise so that a missing tag is reported explicitly instead
        # of surfacing later as an UnboundLocalError.
        tempval = ro = dp = None
        for i in row[fieldId['INFO']].split(';'):
            if i.startswith("AO="):
                tempval = i.split('=')[1]
            if i.startswith("RO="):
                ro = i.split('=')[1]
            if i.startswith("DP="):
                dp = i.split("=")[1]

        missing = [tag for tag, val in (('AO', tempval), ('RO', ro), ('DP', dp))
                   if val is None]
        if missing:
            abortWithMessage(
                "INFO field is missing required tag(s): {0}".format(
                    ", ".join(missing)))

        if ',' in str(tempval):
            # The int() conversion is what can actually fail, so it must live
            # inside the try; previously it sat outside and the handler was
            # unreachable.
            try:
                ao = sum(
                    int(numeric_string) for numeric_string in tempval.split(','))
            except ValueError:
                abortWithMessage(
                    "AO should be an int, or a list of ints: AO = {0}/".format(
                        tempval))
        else:
            ao = tempval
        vf = float(float(ao) / float(float(ro) + float(ao)))

        # Deletions (REF longer than an ALT allele) use the same POS as every
        # other record; the former per-ALT loop reassigned the identical value
        # and was removed as a no-op.
        position = int(row[fieldId['POS']])

        var = Variant(source=fn.split('/')[-1],
                      pos=HTSeq.GenomicPosition(chrom, int(position)),
                      ref=ref,
                      alt=alt,
                      frac=vf,
                      dp=dp,
                      eff=effect.strip(';'),
                      fc=fc.strip(';'))
        return var
Exemple #32
0
    def parse_GenericGATK(self):
        ''' 
        Generic GATK parser function. This was written for the Illumina BaseSpace BWA Enrichment Workflow vcf files, but may apply to more filetypes
        Input: InputParser object. Output: Variant object 

        The allelic depths are taken from the "AD" entry of the last column
        named in the header: the first value is the reference count, the sum
        of the remaining values is the alternate count.  Depth is ref + alt and
        the variant fraction is alt / depth (0 with a warning when depth is 0).

        Raises ValueError when the FORMAT column has no "AD" key.
        '''
        row = self.row
        fieldId = self.fieldId
        header = self.header
        fn = self.fn
        chrom = row[0]
        ref = row[3]
        alt = row[4]
        effect = self.eff
        fc = self.fc
        position = int(row[fieldId['POS']])

        vf = dp = None
        for j, key in enumerate(row[fieldId['FORMAT']].split(':')):
            if str(key) != "AD":
                continue
            depths = [
                int(x)
                for x in row[fieldId[header[-1]]].split(':')[j].split(',')
            ]
            ro = depths[0]
            # Sum every alternate allele: taking only the last value fails
            # when the mutation has two alternate alleles in the same VCF line.
            ao = sum(depths[1:])
            dp = ro + ao
            try:
                # one VF for all possible alternate alleles. Nothing unusual,
                # unless the mutation has multiple alt alleles in 1 vcf line
                vf = float(float(ao) / float(dp))
            except ZeroDivisionError:
                print("\nwarning: no vaf?\n" + str(row) + "\n")
                vf = 0
            break

        if dp is None:
            # Previously this surfaced as an UnboundLocalError on vf/dp below.
            raise ValueError(
                "no AD entry in FORMAT column: " + str(row[fieldId['FORMAT']]))

        var = Variant(source=fn.split('/')[-1],
                      pos=HTSeq.GenomicPosition(chrom, int(position)),
                      ref=ref,
                      alt=alt,
                      frac=vf,
                      dp=dp,
                      eff=effect.strip(';'),
                      fc=fc.strip(';'))
        return var
 def __setBlastResults__(self, name, infile):
     """Read blast hits from ``infile`` into ``self.results``, keyed by chromosome.

     Each file is one sample.  The delimiter is detected from the first line,
     which is then parsed like every other line.  A row is kept only when
     ``__evaluateRows__`` accepts it and its subject id is a chromosome
     present in ``self.variants[name]``.
     """
     delim = None
     with open(infile, "r") as handle:
         for lineno, line in enumerate(handle):
             if lineno == 0:
                 delim = getDelim(line)
             row = line.strip().split(delim)
             chrom = row[self.bhead["subjectid"]]
             passed = self.__evaluateRows__(row)
             # Explicit comparison kept on purpose: only a result equal to
             # True counts as sufficient match quality.
             if not (passed == True and chrom in self.variants[name]):
                 continue
             qid = row[self.bhead["queryid"]]
             start = row[self.bhead["sstart"]]
             end = row[self.bhead["send"]]
             if chrom not in self.results:
                 self.results[chrom] = []
             self.results[chrom].append(Variant(qid, chrom, start, end, row))
Exemple #34
0
 def get_product_variants(self, product):
     """Fetch ``<product.url>.json`` and return its variants.

     Returns a list of ``Variant(id, title)`` objects built from the
     'variants' array of the JSON response, or ``None`` when the server
     answers with an HTTP error status (the failure is logged).

     Raises Exception when ``product`` is not a ``Product`` instance.
     """
     if not isinstance(product, Product):
         raise Exception('Expected product object')
     log('[{}.json] Getting product variants'.format(product.url), color='blue')
     endpoint = '{}.json'.format(product.url)
     # NOTE(review): verify=False disables TLS certificate verification;
     # confirm this is intentional.
     r = self.S.get(
         endpoint,
         headers=self.headers,
         verify=False
     )
     try:
         r.raise_for_status()
     except requests.exceptions.HTTPError:
         log('[error][{}][{}.json] Failed to get variants'.format(r.status_code, product.url), color='red')
         return None
     # r.json() returns the decoded payload, which is not a context manager,
     # so it must be assigned rather than used in a `with` statement.
     payload = r.json()
     return [Variant(var['id'], var['title']) for var in payload['variants']]
Exemple #35
0
def main():
    """Convert a pipeline XML description into an executable shell script.

    Command line: ``convert <pipeline>.xml <output>.sh (arg_name=arg_value)*``.
    Writes the package header to 'Test/diff_expr/include.sh', writes the
    generated script to the output path, marks it executable and prints it.
    """
    if len(sys.argv) < 3:
        print('usage:\nconvert <pipeline>.xml <output>.sh (arg_name=arg_value)*')
        return

    pl_file = sys.argv[1]
    script = sys.argv[2]

    # Split on the first '=' only, so values may themselves contain '='.
    args = {}
    for item in sys.argv[3:]:
        arg, value = item.split('=', 1)
        args[arg] = Variant.from_string(value, 'string')

    pm = PackageManager()
    # Context managers guarantee the files are closed even if a write fails.
    with open('Test/diff_expr/include.sh', 'w') as include_sh:
        include_sh.write(pm.get_header())

    pipeline = Pipeline(pl_file, pm)
    with open(script, 'w') as out_file:
        output = pipeline.generate(args)
        out_file.write('DIR="${BASH_SOURCE%/*}"\n. "$DIR/include.sh"\n\n')
        out_file.write(output)

    system('chmod +x ' + script)
    print(output)
Exemple #36
0
    def _process_option(self, node, args):
        '''Build the Option described by node, applying values from the step declaration.

        The default value comes from, in order of precedence: the 'default'
        attribute (parsed with the node's 'type'), the input named by
        'default_ref', or every <default> child element (each evaluated in
        turn).  If args holds an entry under the option's name it is set as
        the option's value.

        Raises RuntimeError when 'default_ref' names an unknown input.
        '''
        attrs = node.attrib
        opt = Option(attrs.get('repr'))
        name = attrs['name']

        if 'default' in attrs:
            parsed_default = Variant.from_string(attrs['default'], attrs['type'])
            opt.set_default_val(parsed_default)
        elif 'default_ref' in attrs:
            ref = attrs['default_ref'].strip()
            if ref not in self._inputs:
                raise RuntimeError('Reference to currently undefined symbol: ' + ref)
            opt.set_default_val(self._inputs[ref].get())
        else:
            # check by RELAXNG
            for child in (c for c in node if c.tag == 'default'):
                opt.set_default_val(self._eval_expression(child))

        if name in args:
            opt.set_val(args[name])

        return opt
Exemple #37
0
def main():
    """Convert the unfiltered records of a VCF file into a '|'-separated file.

    Reads ``options.input_vcf`` and writes ``options.output_vcf``: first a
    header produced by the Variant helper, then one entry for every record
    whose FILTER list is empty.
    """
    global options, args
    separator = '|'

    # Open and parse each line of the vcf file; the handle is closed when the
    # block exits (it was previously left open).
    with open(options.input_vcf, 'r') as vcf_handle:
        input_vcf = vcf.Reader(vcf_handle)
        if options.non_model:
            variant = Variant(samples=input_vcf.samples, organism_type='non_model', ploidy=options.ploidy)
        else:
            variant = Variant(samples=input_vcf.samples, ploidy=options.ploidy)

        # Open output file
        with open(options.output_vcf, 'w') as output_psv:
            # Generate output file header
            output_psv.write(variant.create_psv_header(separator=separator))

            # Now parse lines in .vcf and output with new format:
            for record in input_vcf:
                # Only output sites that haven't been filtered out
                if len(record.FILTER) == 0:
                    variant.get_from_record(record=record)
                    output_psv.write(variant.put_to_psv(separator=separator))
Exemple #38
0
def join(mod_params, args):
    """Concatenate the string form of every argument into one 'string' Variant.

    ``mod_params`` is used as the separator; a falsy value means no separator.
    """
    glue = (mod_params or '').join
    pieces = [arg.to_string() for arg in args]
    return Variant.from_string(glue(pieces), 'string')
Exemple #39
0
def base_name(mod_params, args):
    """Return the single argument's string with its last '.'-suffix removed.

    A value without any '.' produces an empty string.  ``mod_params`` is
    not used.
    """
    assert len(args) == 1
    # Everything before the last '.', or '' when there is no '.'.
    stem = args[0].to_string().rpartition('.')[0]
    # NOTE(review): from_string is called without a type argument here, unlike
    # the other call sites visible in this chunk - confirm it has a default.
    return Variant.from_string(stem)
Exemple #40
0
def to_list(mod_params, args):
    """Wrap the given arguments in a single list Variant; ``mod_params`` is unused."""
    wrapped = Variant.from_variant_list(args)
    return wrapped
Exemple #41
0
def find_variants(covlist, seq, chrom, min_cov=5, min_score=30, exclude_edges=False, exclude_overlaps=False):
    ''' Identify coverage variants in covlist.

        covlist is indexed from 1: index 0 must hold a placeholder below
        min_cov, and len(covlist) - 1 must equal len(seq).

        Two kinds of variants are reported:
          * 'no_cov'  - runs of positions whose coverage is below min_cov
          * 'cov_dip' - runs of positions whose adjusted coverage score is at
                        least min_score

        exclude_edges drops dip intervals that start at position 1 or end at
        the last position; exclude_overlaps passes the dip intervals through
        remove_overlap together with the no-coverage intervals.

        Returns dict with keys 'mean_cov', 'pct_cov' and, when at least one
        variant was found, 'variants' (a list of Variant objects).  When no
        position has coverage only 'mean_cov' and 'pct_cov' are returned.
    '''
    # NOTE(review): scipy.mean is a deprecated alias of numpy.mean in newer
    # SciPy releases - confirm the SciPy version this runs against.

    assert min(covlist[1:]) >= 0
    assert len(covlist) - 1 == len(seq), "Number of coverage values (%d) is not equal to sequence length (%d)" % (len(covlist)-1,len(seq))
    retval = {}
    nocov = [i for i,v in enumerate(covlist) if v < min_cov]
    nocov.remove(0)  # drop the placeholder slot at index 0

    retval['mean_cov'] = scipy.mean(covlist[1:])
    retval['pct_cov'] = 1 - (float(len(nocov)) / (len(covlist) - 1))

    # Nothing is covered: there are no meaningful intervals to report.
    if len(nocov) == len(seq):
        return retval

    nocov_intervals = list(intervals(nocov))
    covscores,localmeans = adjusted_coverage_score(covlist)

    covdip = [i for i,v in enumerate(covscores) if v >= min_score]
    covdip_intervals = list(intervals(covdip))

    # refine intervals
    if exclude_edges:
        # ignore intervals that overlap the beginning and end of reference
        covdip_intervals = [iv for iv in covdip_intervals if not iv[0]==1 and not iv[1]==(len(covlist)-1)]

    if exclude_overlaps:
        # ignore covdip intervals that overlap with nocov intervals
        covdip_intervals = remove_overlap(covdip_intervals,nocov_intervals)

    # NOTE(review): a filter that removed no-coverage positions from `covdip`
    # used to run here, but its result was never read (the intervals are
    # already built above), so the quadratic scan was dropped.  If dips were
    # meant to exclude no-coverage positions, apply the filter before
    # building covdip_intervals.

    variants = []
    for iv in nocov_intervals:
        data = {'chrom':chrom, 'pos':iv[0], 'type': 'no_cov'}
        data['length'] = iv[1] - iv[0] + 1
        data['mean_cov'] = scipy.mean(covlist[iv[0]:(iv[1]+1)])
        variants.append(Variant.from_dict(data))

    for iv in covdip_intervals:
        data = {'chrom':chrom, 'pos':iv[0], 'type': 'cov_dip'}
        data['length'] = iv[1] - iv[0] + 1
        data['mean_cov'] = scipy.mean(covlist[iv[0]:(iv[1]+1)])
        intscores = covscores[iv[0]:(iv[1]+1)]
        intmeans  = localmeans[iv[0]:(iv[1]+1)]
        data['quality'] = max(intscores)
        data['info'] = {'CovScores':'%s' % ','.join(['%d' % int(round(v)) for v in intscores]),
                        'LocalMeans':'%s' % ','.join(['%d' % int(round(v)) for v in intmeans]),
                        }
        # NOTE(review): seq is sliced with the same indices as the 1-based
        # covlist - confirm this is not shifted by one position.
        data['ref'] = str(seq[iv[0]:(iv[1]+1)].seq).upper()
        variants.append(Variant.from_dict(data))

    if variants:
        retval['variants'] = variants

    return retval
Exemple #42
0
def script(res1, res2, out, **kwargs):
    """Load the variants of two result files.

    ``res1`` is loaded first, then ``res2``; ``out`` and ``kwargs`` are not
    used by the visible body.
    """
    loaded = [Variant.load_res_file(path) for path in (res1, res2)]
    variants, rand_variants = loaded
Exemple #43
0
#import matplotlib.pyplot as plt
from variant import Variant

"""Plot AF vs. silva score, given a list of variants"""
def plot_freq(variants):
    """Write one tab-separated "af<TAB>score" line per variant to a fixed file.

    Each element of ``variants`` must expose ``af`` and ``score`` attributes.
    The output path is hard-coded; the scatter plot itself is currently
    disabled (see the commented-out matplotlib calls).
    """
    afs = [x.af for x in variants]
    scores = [x.score for x in variants]
    #plt.scatter(scores, afs)
    #plt.show()
    # The context manager closes the file even if a write raises.
    with open('/dupa-filer/talf/silva-pipeline/test.out', 'w') as f:
        for a, s in zip(afs, scores):
            f.write('\t'.join([str(a), str(s)]) + '\n')

# Script entry point: load the variants from a hard-coded results file and
# hand them to plot_freq.
if __name__ == '__main__':
    variants = Variant.load_res_file('/dupa-filer/talf/silva-pipeline/1000gp_rare_results.txt')
    plot_freq(variants)
  if not os.path.isdir(args.jobdir):   sys.exit('Error: directory "%s" does not exist' % args.pooldir)
  if not os.path.exists(args.reffile): sys.exit('Error: reference file "%s" does not exist' % args.reffile)  

  job_path       = os.path.abspath(args.jobdir)
  reference_file = os.path.abspath(args.reffile)

  # load sequences
  seqs = dict( [(s.id,s) for s in SeqIO.parse(reference_file,'fasta')] )

  summaries = dict(( (name,{'variants':[]}) for name in seqs.keys()))
  ''' GATK variants '''
  print >>sys.stderr, "[ Reading GATK variants ]"
  vlines = [l.strip('\n') for l in open('%s/GATK/snps.gatk.vcf' % job_path,'rU') if not l.startswith('#')]
  for l in vlines:
    v = Variant.from_vcf(l)
    v.caller = 'gatk'
    summaries[v.chrom]['variants'].append(v)

  ''' PacBio variants '''
  print >>sys.stderr, "[ Reading GenCons variants ]"
  glines = [l.strip('\n') for l in gzip.open('%s/data/variants.gff.gz' % job_path,'rb') if not l.startswith('#')]
  for l in glines:
    v = Variant.from_gff(l)
    v.caller = 'gencons'
    summaries[v.chrom]['variants'].append(v)

  ''' coverage variants '''
  print >>sys.stderr, "[ Reading coverage variants ]"
  covdata = parse_covdepth('%s/GATK/covdepth' % job_path)
  covvars = {}