Ejemplo n.º 1
0
    def _process_data(self, raw, limit=None):
        """
        This function will process the data files from Coriell.
        We make the assumption that any alleles listed are variants
        (alternates to w.t.)

        Triples: (examples)

        :NIGMSrepository a CLO_0000008 #repository
        label : NIGMS Human Genetic Cell Repository
        foaf:page https://catalog.coriell.org/0/sections/collections/NIGMS/?SsId=8

            line_id a CL_0000057,  #fibroblast line
                derives_from patient_id
                part_of :NIGMSrepository
                RO:model_of OMIM:disease_id

            patient id a foaf:person,
                label: "fibroblast from patient 12345 with disease X"
                member_of family_id  #what is the right thing here?
                SIO:race EFO:caucasian  #subclass of EFO:0001799
                in_taxon NCBITaxon:9606
                dc:description Literal(remark)
                RO:has_phenotype OMIM:disease_id
                GENO:has_genotype genotype_id

            family_id a owl:NamedIndividual
                foaf:page "https://catalog.coriell.org/0/Sections/BrowseCatalog/FamilyTypeSubDetail.aspx?PgId=402&fam=2104&coll=GM"

            genotype_id a intrinsic_genotype
                GENO:has_alternate_part allelic_variant_id
                we don't necessarily know much about the genotype,
                other than the allelic variant. also there's the sex here

            pub_id mentions cell_line_id

        :param raw:
        :param limit:
        :return:
        """
        logger.info("Processing Data from %s", raw)
        gu = GraphUtils(curie_map.get())

        if self.testMode:      # set the graph to build
            g = self.testgraph
        else:
            g = self.graph

        line_counter = 0
        geno = Genotype(g)
        du = DipperUtil()

        gu.loadProperties(g, geno.object_properties, gu.OBJPROP)
        gu.loadAllProperties(g)

        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            next(filereader, None)  # skip the header row
            for row in filereader:
                if not row:
                    pass
                else:
                    line_counter += 1

                    (catalog_id, description, omim_number, sample_type,
                     cell_line_available, dna_in_stock, dna_ref, gender, age,
                     race, ethnicity, affected, karyotype, relprob, mutation,
                     gene, family_id, collection, url, cat_remark, pubmed_ids,
                     family_member, variant_id, dbsnp_id, species) = row

                    # example:
                    # GM00003,HURLER SYNDROME,607014,Fibroblast,Yes,No,,Female,26 YR,Caucasian,,,,
                    # parent,,,39,NIGMS Human Genetic Cell Repository,
                    # http://ccr.coriell.org/Sections/Search/Sample_Detail.aspx?Ref=GM00003,
                    # 46;XX; clinically normal mother of a child with Hurler syndrome; proband not in Repository,,
                    # 2,,18343,H**o sapiens

                    if self.testMode and catalog_id not in self.test_lines:
                        # skip rows not in our test lines, when in test mode
                        continue

                    # ###########    BUILD REQUIRED VARIABLES    ###########

                    # Make the cell line ID
                    cell_line_id = 'Coriell:'+catalog_id.strip()

                    # Map the cell/sample type
                    cell_type = self._map_cell_type(sample_type)

                    # Make a cell line label
                    line_label = \
                        collection.partition(' ')[0]+'-'+catalog_id.strip()

                    # Map the repository/collection
                    repository = self._map_collection(collection)

                    # patients are uniquely identified by one of:
                    # dbsnp id (which is == an individual haplotype)
                    # family id + family member (if present) OR
                    # probands are usually family member zero
                    # cell line id
                    # since some patients have >1 cell line derived from them,
                    # we must make sure that the genotype is attached to
                    # the patient, and can be inferred to the cell line
                    # examples of repeated patients are:
                    #   famid=1159, member=1; fam=152,member=1

                    # Make the patient ID

                    # make an anonymous patient
                    patient_id = '_person'
                    if self.nobnodes:
                        patient_id = ':'+patient_id
                    if family_id != '':
                        patient_id = \
                            '-'.join((patient_id, family_id, family_member))
                    else:
                        # make an anonymous patient
                        patient_id = '-'.join((patient_id, catalog_id.strip()))

                    # properties of the individual patients:  sex, family id,
                    # member/relproband, description descriptions are
                    # really long and ugly SCREAMING text, so need to clean up
                    # the control cases are so odd with this labeling scheme;
                    # but we'll deal with it as-is for now.
                    short_desc = (description.split(';')[0]).capitalize()
                    if affected == 'Yes':
                        affected = 'affected'
                    elif affected == 'No':
                        affected = 'unaffected'
                    gender = gender.lower()
                    patient_label = ' '.join((affected, gender, relprob))
                    if relprob == 'proband':
                        patient_label = \
                            ' '.join(
                                (patient_label.strip(), 'with', short_desc))
                    else:
                        patient_label = \
                            ' '.join(
                                (patient_label.strip(), 'of proband with',
                                 short_desc))

                    # #############    BUILD THE CELL LINE    #############

                    # Adding the cell line as a typed individual.
                    cell_line_reagent_id = 'CLO:0000031'

                    gu.addIndividualToGraph(
                        g, cell_line_id, line_label, cell_line_reagent_id)

                    # add the equivalent id == dna_ref
                    if dna_ref != '' and dna_ref != catalog_id:
                        equiv_cell_line = 'Coriell:'+dna_ref
                        # some of the equivalent ids are not defined
                        # in the source data; so add them
                        gu.addIndividualToGraph(
                            g, equiv_cell_line, None, cell_line_reagent_id)
                        gu.addSameIndividual(g, cell_line_id, equiv_cell_line)

                    # Cell line derives from patient
                    geno.addDerivesFrom(cell_line_id, patient_id)
                    geno.addDerivesFrom(cell_line_id, cell_type)

                    # Cell line a member of repository
                    gu.addMember(g, repository, cell_line_id)

                    if cat_remark != '':
                        gu.addDescription(g, cell_line_id, cat_remark)

                    # Cell age_at_sampling
                    # TODO add the age nodes when modeled properly in #78
                    # if (age != ''):
                        # this would give a BNode that is an instance of Age.
                        # but i don't know how to connect
                        # the age node to the cell line? we need to ask @mbrush
                        # age_id = '_'+re.sub('\s+','_',age)
                        # gu.addIndividualToGraph(
                        #   g,age_id,age,self.terms['age'])
                        # gu.addTriple(
                        #   g,age_id,self.properties['has_measurement'],age,
                        #   True)

                    # #############    BUILD THE PATIENT    #############

                    # Add the patient ID as an individual.
                    gu.addPerson(g, patient_id, patient_label)
                    # TODO map relationship to proband as a class
                    # (what ontology?)

                    # Add race of patient
                    # FIXME: Adjust for subcategories based on ethnicity field
                    # EDIT: There are 743 different entries for ethnicity...
                    # Too many to map?
                    # Add ethnicity as literal in addition to the mapped race?
                    # Adjust the ethnicity txt (if using)
                    # to initial capitalization to remove ALLCAPS

                    # TODO race should go into the individual's background
                    # and abstracted out to the Genotype class punting for now.
                    # if race != '':
                    #    mapped_race = self._map_race(race)
                    #    if mapped_race is not None:
                    #        gu.addTriple(
                    #           g,patient_id,self.terms['race'],mapped_race)
                    #        gu.addSubclass(
                    #           g,self.terms['ethnic_group'],mapped_race)

                    # #############    BUILD THE FAMILY    #############

                    # Add triples for family_id, if present.
                    if family_id != '':
                        family_comp_id = 'CoriellFamily:'+family_id

                        family_label = \
                            ' '.join(('Family of proband with', short_desc))

                        # Add the family ID as a named individual
                        gu.addIndividualToGraph(
                            g, family_comp_id, family_label,
                            geno.genoparts['family'])

                        # Add the patient as a member of the family
                        gu.addMemberOf(g, patient_id, family_comp_id)

                    # #############    BUILD THE GENOTYPE   #############

                    # the important things to pay attention to here are:
                    # karyotype = chr rearrangements  (somatic?)
                    # mutation = protein-level mutation as a label,
                    # often from omim
                    # gene = gene symbol - TODO get id
                    # variant_id = omim variant ids (; delimited)
                    # dbsnp_id = snp individual ids = full genotype?

                    # note GM00633 is a good example of chromosomal variation
                    # - do we have enough to capture this?
                    # GM00325 has both abnormal karyotype and variation

                    # make an assumption that if the taxon is blank,
                    # that it is human!
                    if species is None or species == '':
                        species = 'H**o sapiens'
                    taxon = self._map_species(species)

                    # if there's a dbSNP id,
                    # this is actually the individual's genotype
                    genotype_id = None
                    genotype_label = None
                    if dbsnp_id != '':
                        genotype_id = 'dbSNPIndividual:'+dbsnp_id.strip()

                    omim_map = {}
                    gvc_id = None

                    # some of the karyotypes are encoded
                    # with terrible hidden codes. remove them here
                    # i've seen a <98> character
                    karyotype = du.remove_control_characters(karyotype)
                    karyotype_id = None
                    if karyotype.strip() != '':
                        karyotype_id = \
                            '_'+re.sub('MONARCH:', '', self.make_id(karyotype))
                        if self.nobnodes:
                            karyotype_id = ':'+karyotype_id
                        # add karyotype as karyotype_variation_complement
                        gu.addIndividualToGraph(
                            g, karyotype_id, karyotype,
                            geno.genoparts['karyotype_variation_complement'])
                        # TODO break down the karyotype into parts
                        # and map into GENO. depends on #77

                        # place the karyotype in a location(s).
                        karyo_chrs = \
                            self._get_affected_chromosomes_from_karyotype(
                                karyotype)
                        for c in karyo_chrs:
                            chr_id = makeChromID(c, taxon, 'CHR')
                            # add an anonymous sequence feature,
                            # each located on chr
                            karyotype_feature_id = '-'.join((karyotype_id, c))
                            karyotype_feature_label = \
                                'some karyotype alteration on chr'+str(c)
                            f = Feature(
                                karyotype_feature_id, karyotype_feature_label,
                                geno.genoparts['sequence_alteration'])
                            f.addFeatureStartLocation(None, chr_id)
                            f.addFeatureToGraph(g)
                            f.loadAllProperties(g)
                            geno.addParts(
                                karyotype_feature_id, karyotype_id,
                                geno.object_properties['has_alternate_part'])

                    if gene != '':
                        vl = gene+'('+mutation+')'

                    # fix the variant_id so it's always in the same order
                    vids = variant_id.split(';')
                    variant_id = ';'.join(sorted(list(set(vids))))

                    if karyotype.strip() != '' \
                            and not self._is_normal_karyotype(karyotype):
                        mutation = mutation.strip()
                        gvc_id = karyotype_id
                        if variant_id != '':
                            gvc_id = '_' + variant_id.replace(';', '-') + '-' \
                                    + re.sub(r'\w*:', '', karyotype_id)
                        if mutation.strip() != '':
                            gvc_label = '; '.join((vl, karyotype))
                        else:
                            gvc_label = karyotype
                    elif variant_id.strip() != '':
                        gvc_id = '_' + variant_id.replace(';', '-')
                        gvc_label = vl
                    else:
                        # wildtype?
                        pass

                    if gvc_id is not None and gvc_id != karyotype_id \
                            and self.nobnodes:
                        gvc_id = ':'+gvc_id

                    # add the karyotype to the gvc.
                    # use reference if normal karyotype
                    karyo_rel = geno.object_properties['has_alternate_part']
                    if self._is_normal_karyotype(karyotype):
                        karyo_rel = \
                            geno.object_properties['has_reference_part']
                    if karyotype_id is not None \
                            and not self._is_normal_karyotype(karyotype) \
                            and gvc_id is not None and karyotype_id != gvc_id:
                        geno.addParts(karyotype_id, gvc_id, karyo_rel)

                    if variant_id.strip() != '':
                        # split the variants & add them as part of the genotype
                        # we don't necessarily know their zygosity,
                        # just that they are part of the genotype variant ids
                        # are from OMIM, so prefix as such we assume that the
                        # sequence alts will be defined in OMIM not here
                        # TODO sort the variant_id list, if the omim prefix is
                        # the same, then assume it's the locus make a hashmap
                        # of the omim id to variant id list;
                        # then build the genotype hashmap is also useful for
                        # removing the "genes" from the list of "phenotypes"

                        # will hold gene/locus id to variant list
                        omim_map = {}

                        locus_num = None
                        for v in variant_id.split(';'):
                            # handle omim-style and odd var ids
                            # like 610661.p.R401X
                            m = re.match(r'(\d+)\.+(.*)', v.strip())
                            if m is not None and len(m.groups()) == 2:
                                (locus_num, var_num) = m.groups()

                            if locus_num is not None \
                                    and locus_num not in omim_map:
                                omim_map[locus_num] = [var_num]
                            else:
                                omim_map[locus_num] += [var_num]

                        for o in omim_map:
                            # gene_id = 'OMIM:' + o  # TODO unused
                            vslc_id = \
                                '_' + '-'.join(
                                    [o + '.' + a for a in omim_map.get(o)])
                            if self.nobnodes:
                                vslc_id = ':'+vslc_id
                            vslc_label = vl
                            # we don't really know the zygosity of
                            # the alleles at all.
                            # so the vslcs are just a pot of them
                            gu.addIndividualToGraph(
                                g, vslc_id, vslc_label,
                                geno.genoparts[
                                    'variant_single_locus_complement'])
                            for v in omim_map.get(o):
                                # this is actually a sequence alt
                                allele1_id = 'OMIM:'+o+'.'+v
                                geno.addSequenceAlteration(allele1_id, None)

                                # assume that the sa -> var_loc -> gene
                                # is taken care of in OMIM
                                geno.addPartsToVSLC(
                                    vslc_id, allele1_id, None,
                                    geno.zygosity['indeterminate'],
                                    geno.object_properties[
                                        'has_alternate_part'])

                            if vslc_id != gvc_id:
                                geno.addVSLCtoParent(vslc_id, gvc_id)

                    if affected == 'unaffected':
                        # let's just say that this person is wildtype
                        gu.addType(g, patient_id, geno.genoparts['wildtype'])
                    elif genotype_id is None:
                        # make an anonymous genotype id
                        genotype_id = '_geno'+catalog_id.strip()
                        if self.nobnodes:
                            genotype_id = ':'+genotype_id

                    # add the gvc
                    if gvc_id is not None:
                        gu.addIndividualToGraph(
                            g, gvc_id, gvc_label,
                            geno.genoparts['genomic_variation_complement'])

                        # add the gvc to the genotype
                        if genotype_id is not None:
                            if affected == 'unaffected':
                                rel = \
                                    geno.object_properties[
                                        'has_reference_part']
                            else:
                                rel = \
                                    geno.object_properties[
                                        'has_alternate_part']
                            geno.addParts(gvc_id, genotype_id, rel)
                        if karyotype_id is not None \
                                and self._is_normal_karyotype(karyotype):
                            if gvc_label is not None and gvc_label != '':
                                genotype_label = \
                                    '; '.join((gvc_label, karyotype))
                            else:
                                genotype_label = karyotype
                            if genotype_id is None:
                                genotype_id = karyotype_id
                            else:
                                geno.addParts(
                                    karyotype_id, genotype_id,
                                    geno.object_properties[
                                        'has_reference_part'])
                        else:
                            genotype_label = gvc_label
                            # use the catalog id as the background
                        genotype_label += ' ['+catalog_id.strip()+']'

                    if genotype_id is not None and gvc_id is not None:
                        # only add the genotype if it has some parts
                        geno.addGenotype(
                            genotype_id, genotype_label,
                            geno.genoparts['intrinsic_genotype'])
                        geno.addTaxon(taxon, genotype_id)
                        # add that the patient has the genotype
                        # TODO check if the genotype belongs to
                        # the cell line or to the patient
                        gu.addTriple(
                            g, patient_id,
                            geno.properties['has_genotype'], genotype_id)
                    else:
                        geno.addTaxon(taxon, patient_id)

                    # TODO: Add sex/gender  (as part of the karyotype?)

                    # #############    DEAL WITH THE DISEASES   #############

                    # we associate the disease to the patient
                    if affected == 'affected':
                        if omim_number != '':
                            for d in omim_number.split(';'):
                                if d is not None and d != '':
                                    # if the omim number is in omim_map,
                                    # then it is a gene not a pheno
                                    if d not in omim_map:
                                        disease_id = 'OMIM:'+d.strip()
                                        # assume the label is taken care of
                                        gu.addClassToGraph(g, disease_id, None)

                                        # add the association:
                                        #   the patient has the disease
                                        assoc = G2PAssoc(
                                            self.name, patient_id, disease_id)
                                        assoc.add_association_to_graph(g)

                                        # this line is a model of this disease
                                        # TODO abstract out model into
                                        # it's own association class?
                                        gu.addTriple(
                                            g, cell_line_id,
                                            gu.properties['model_of'],
                                            disease_id)
                                    else:
                                        logger.info(
                                            'removing %s from disease list ' +
                                            'since it is a gene', d)

                    # #############    ADD PUBLICATIONS   #############

                    if pubmed_ids != '':
                        for s in pubmed_ids.split(';'):
                            pubmed_id = 'PMID:'+s.strip()
                            ref = Reference(pubmed_id)
                            ref.setType(Reference.ref_types['journal_article'])
                            ref.addRefToGraph(g)
                            gu.addTriple(
                                g, pubmed_id, gu.properties['mentions'],
                                cell_line_id)

                    if not self.testMode \
                            and (limit is not None and line_counter > limit):
                        break

            Assoc(self.name).load_all_properties(g)

        return
Ejemplo n.º 2
0
    def _get_variants(self, limit):
        """
        Currently loops through the variant_summary file.

        :param limit:
        :return:
        """
        gu = GraphUtils(curie_map.get())

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        geno = Genotype(g)
        gu.loadAllProperties(g)
        f = Feature(None, None, None)
        f.loadAllProperties(g)

        gu.loadAllProperties(g)

        # add the taxon and the genome
        tax_num = '9606'  # HARDCODE
        tax_id = 'NCBITaxon:'+tax_num
        tax_label = 'Human'
        gu.addClassToGraph(g, tax_id, None)
        geno.addGenome(tax_id, None)  # label gets added elsewhere

        # not unzipping the file
        logger.info("Processing Variant records")
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files['variant_summary']['file']))
        with gzip.open(myfile, 'rb') as f:
            for line in f:
                # skip comments
                line = line.decode().strip()
                if re.match('^#', line):
                    continue

                # AlleleID               integer value as stored in the AlleleID field in ClinVar  (//Measure/@ID in the XML)
                # Type                   character, the type of variation
                # Name                   character, the preferred name for the variation
                # GeneID                 integer, GeneID in NCBI's Gene database
                # GeneSymbol             character, comma-separated list of GeneIDs overlapping the variation
                # ClinicalSignificance   character, comma-separated list of values of clinical significance reported for this variation
                #                          for the mapping between the terms listed here and the integers in the .VCF files, see
                #                          http://www.ncbi.nlm.nih.gov/clinvar/docs/clinsig/
                # RS# (dbSNP)            integer, rs# in dbSNP
                # nsv (dbVar)            character, the NSV identifier for the region in dbVar
                # RCVaccession           character, list of RCV accessions that report this variant
                # TestedInGTR            character, Y/N for Yes/No if there is a test registered as specific to this variation in the NIH Genetic Testing Registry (GTR)
                # PhenotypeIDs           character, list of db names and identifiers for phenotype(s) reported for this variant
                # Origin                 character, list of all allelic origins for this variation
                # Assembly               character, name of the assembly on which locations are based
                # Chromosome             character, chromosomal location
                # Start                  integer, starting location, in pter->qter orientation
                # Stop                   integer, end location, in pter->qter orientation
                # Cytogenetic            character, ISCN band
                # ReviewStatus           character, highest review status for reporting this measure. For the key to the terms,
                #                            and their relationship to the star graphics ClinVar displays on its web pages,
                #                            see http://www.ncbi.nlm.nih.gov/clinvar/docs/variation_report/#interpretation
                # HGVS(c.)               character, RefSeq cDNA-based HGVS expression
                # HGVS(p.)               character, RefSeq protein-based HGVS expression
                # NumberSubmitters       integer, number of submissions with this variant
                # LastEvaluated          datetime, the latest time any submitter reported clinical significance
                # Guidelines             character, ACMG only right now, for the reporting of incidental variation in a Gene
                #                                (NOTE: if ACMG, not a specific to the allele but to the Gene)
                # OtherIDs               character, list of other identifiers or sources of information about this variant
                # VariantID              integer, the value used to build the URL for the current default report,
                #                            e.g. http://www.ncbi.nlm.nih.gov/clinvar/variation/1756/
                #

                # a crude check that there's an expected number of cols.  if not, error out because something changed.
                num_cols = len(line.split('\t'))
                expected_numcols = 28
                if num_cols != expected_numcols:
                    logger.error("Unexpected number of columns in raw file (%d actual vs %d expected)",
                                 num_cols, expected_numcols)

                (allele_num, allele_type, allele_name, gene_num, gene_symbol, clinical_significance,
                 dbsnp_num, dbvar_num, rcv_nums, tested_in_gtr, phenotype_ids, origin,
                 assembly, chr, start, stop, cytogenetic_loc,
                 review_status, hgvs_c, hgvs_p, number_of_submitters, last_eval,
                 guidelines, other_ids, variant_num, reference_allele, alternate_allele, categories) = line.split('\t')

                # #### set filter=None in init if you don't want to have a filter
                # if self.filter is not None:
                #    if ((self.filter == 'taxids' and (int(tax_num) not in self.tax_ids))
                #            or (self.filter == 'geneids' and (int(gene_num) not in self.gene_ids))):
                #        continue
                # #### end filter

                line_counter += 1

                pheno_list = []
                if phenotype_ids != '-':
                    # trim any leading/trailing semicolons/commas
                    phenotype_ids = re.sub('^[;,]', '', phenotype_ids)
                    phenotype_ids = re.sub('[;,]$', '', phenotype_ids)
                    pheno_list = re.split('[,;]', phenotype_ids)

                if self.testMode:
                    # get intersection of test disease ids and these phenotype_ids
                    intersect = list(set([str(i) for i in self.disease_ids]) & set(pheno_list))
                    if int(gene_num) not in self.gene_ids and int(variant_num) not in self.variant_ids \
                            and len(intersect) < 1:
                        continue

                # TODO may need to switch on assembly to create correct assembly/build identifiers
                build_id = ':'.join(('NCBIGenome', assembly))

                # make the reference genome build
                geno.addReferenceGenome(build_id, assembly, tax_id)

                allele_type_id = self._map_type_of_allele(allele_type)
                bandinbuild_id = None
                if str(chr) == '':
                    # check cytogenic location
                    if str(cytogenetic_loc).strip() != '':
                        # use cytogenic location to get the approximate location
                        # strangely, they still put an assembly number even when there's no numeric location
                        if not re.search('-',str(cytogenetic_loc)):
                            band_id = makeChromID(re.split('-',str(cytogenetic_loc)), tax_num, 'CHR')
                            geno.addChromosomeInstance(cytogenetic_loc, build_id, assembly, band_id)
                            bandinbuild_id = makeChromID(re.split('-',str(cytogenetic_loc)), assembly, 'MONARCH')
                        else:
                            # can't deal with ranges yet
                            pass
                else:
                    # add the human chromosome class to the graph, and add the build-specific version of it
                    chr_id = makeChromID(str(chr), tax_num, 'CHR')
                    geno.addChromosomeClass(str(chr), tax_id, tax_label)
                    geno.addChromosomeInstance(str(chr), build_id, assembly, chr_id)
                    chrinbuild_id = makeChromID(str(chr), assembly, 'MONARCH')

                seqalt_id = ':'.join(('ClinVarVariant', variant_num))
                gene_id = None
                if str(gene_num) != '-1' and str(gene_num) != 'more than 10':  # they use -1 to indicate unknown gene
                    gene_id = ':'.join(('NCBIGene', str(gene_num)))

                # FIXME there are some "variants" that are actually haplotypes
                # probably will get taken care of when we switch to processing the xml
                # for example, variant_num = 38562
                # but there's no way to tell if it's a haplotype in the csv data
                # so the dbsnp or dbvar should probably be primary, and the variant num be the vslc,
                # with each of the dbsnps being added to it

                # todo clinical significance needs to be mapped to a list of terms
                # first, make the variant:
                f = Feature(seqalt_id, allele_name, allele_type_id)

                if start != '-' and start.strip() != '':
                    f.addFeatureStartLocation(start, chrinbuild_id)
                if stop != '-' and stop.strip() != '':
                    f.addFeatureEndLocation(stop, chrinbuild_id)

                f.addFeatureToGraph(g)

                if bandinbuild_id is not None:
                    f.addSubsequenceOfFeature(g, bandinbuild_id)

                # CHECK - this makes the assumption that there is only one affected chromosome per variant
                # what happens with chromosomal rearrangement variants?  shouldn't both chromosomes be here?

                # add the hgvs as synonyms
                if hgvs_c != '-' and hgvs_c.strip() != '':
                    gu.addSynonym(g, seqalt_id, hgvs_c)
                if hgvs_p != '-' and hgvs_p.strip() != '':
                    gu.addSynonym(g, seqalt_id, hgvs_p)

                # add the dbsnp and dbvar ids as equivalent
                if dbsnp_num != '-' and int(dbsnp_num) != -1:
                    dbsnp_id = 'dbSNP:rs'+str(dbsnp_num)
                    gu.addIndividualToGraph(g, dbsnp_id, None)
                    gu.addSameIndividual(g, seqalt_id, dbsnp_id)
                if dbvar_num != '-':
                    dbvar_id = 'dbVar:'+dbvar_num
                    gu.addIndividualToGraph(g, dbvar_id, None)
                    gu.addSameIndividual(g, seqalt_id, dbvar_id)

                # TODO - not sure if this is right... add as xref?
                # the rcv is like the combo of the phenotype with the variant
                if rcv_nums != '-':
                    for rcv_num in re.split(';',rcv_nums):
                        rcv_id = 'ClinVar:'+rcv_num
                        gu.addIndividualToGraph(g, rcv_id, None)
                        gu.addXref(g, seqalt_id, rcv_id)

                if gene_id is not None:
                    # add the gene
                    gu.addClassToGraph(g, gene_id, gene_symbol)
                    # make a variant locus
                    vl_id = '_'+gene_num+'-'+variant_num
                    if self.nobnodes:
                        vl_id = ':'+vl_id
                    vl_label = allele_name
                    gu.addIndividualToGraph(g, vl_id, vl_label, geno.genoparts['variant_locus'])
                    geno.addSequenceAlterationToVariantLocus(seqalt_id, vl_id)
                    geno.addAlleleOfGene(vl_id, gene_id)
                else:
                    # some basic reporting
                    gmatch = re.search('\(\w+\)', allele_name)
                    if gmatch is not None and len(gmatch.groups()) > 0:
                        logger.info("Gene found in allele label, but no id provided: %s", gmatch.group(1))
                    elif re.match('more than 10', gene_symbol):
                        logger.info("More than 10 genes found; need to process XML to fetch (variant=%d)", int(variant_num))
                    else:
                        logger.info("No gene listed for variant %d", int(variant_num))

                # parse the list of "phenotypes" which are diseases.  add them as an association
                # ;GeneReviews:NBK1440,MedGen:C0392514,OMIM:235200,SNOMED CT:35400008;MedGen:C3280096,OMIM:614193;MedGen:CN034317,OMIM:612635;MedGen:CN169374
                # the list is both semicolon delimited and comma delimited, but i don't know why!
                # some are bad, like: Orphanet:ORPHA ORPHA319705,SNOMED CT:49049000
                if phenotype_ids != '-':
                    for p in pheno_list:
                        m = re.match("(Orphanet:ORPHA(?:\s*ORPHA)?)", p)
                        if m is not None and len(m.groups()) > 0:
                            p = re.sub(m.group(1), 'Orphanet:', p.strip())
                        elif re.match('SNOMED CT', p):
                            p = re.sub('SNOMED CT', 'SNOMED', p.strip())

                        assoc = G2PAssoc(self.name, seqalt_id, p.strip())
                        assoc.add_association_to_graph(g)

                if other_ids != '-':
                    id_list = other_ids.split(',')
                    # process the "other ids"
                    # ex: CFTR2:F508del,HGMD:CD890142,OMIM Allelic Variant:602421.0001
                    # TODO make more xrefs
                    for xrefid in id_list:
                        prefix = xrefid.split(':')[0].strip()
                        if prefix == 'OMIM Allelic Variant':
                            xrefid = 'OMIM:'+xrefid.split(':')[1]
                            gu.addIndividualToGraph(g, xrefid, None)
                            gu.addSameIndividual(g, seqalt_id, xrefid)
                        elif prefix == 'HGMD':
                            gu.addIndividualToGraph(g, xrefid, None)
                            gu.addSameIndividual(g, seqalt_id, xrefid)
                        elif prefix == 'dbVar' and dbvar_num == xrefid.split(':')[1].strip():
                            pass  # skip over this one
                        elif re.search('\s', prefix):
                            pass
                            # logger.debug('xref prefix has a space: %s', xrefid)
                        else:
                            # should be a good clean prefix
                            # note that HGMD variants are in here as Xrefs because we can't resolve URIs for them
                            # logger.info("Adding xref: %s", xrefid)
                            # gu.addXref(g, seqalt_id, xrefid)
                            # logger.info("xref prefix to add: %s", xrefid)
                            pass

                if not self.testMode and limit is not None and line_counter > limit:
                    break

        gu.loadProperties(g, G2PAssoc.object_properties, gu.OBJPROP)
        gu.loadProperties(g, G2PAssoc.annotation_properties, gu.ANNOTPROP)
        gu.loadProperties(g, G2PAssoc.datatype_properties, gu.DATAPROP)

        logger.info("Finished parsing variants")

        return