def _get_curie_and_type_from_id(variant_id):
    """
    Make a best guess at the curie and type (snp, haplotype, ...) of a variant id.

    ``None`` is returned for both curie and type when the id is blank or
    cannot be processed.

    As of 2019-May three snp ids have ' e' or ' a' appended (note the space),
    for example 'rs2440154 e-A' and 'rs2440154 e'.  Including the suffix in
    the url is a web noop, but the embedded space breaks rdflib, so all
    spaces are removed from rs ids.

    :param variant_id: variant identifier string, optionally followed by
        '-<allele>' (e.g. 'rs123-A', 'kgp123-C', 'chr10:106180121-G')
    :return: tuple ``(curie, variant_type)``; both are set or both are ``None``
    """
    curie = None
    variant_type = None

    # remove the space before hyphens, then any surrounding whitespace
    variant_id = variant_id.replace(' -', '-').strip()

    if ' x ' in variant_id or ',' in variant_id:
        # TODO deal with rs1234 x rs234... (haplotypes?)
        LOG.warning(
            "Cannot parse variant groups of this format: %s", variant_id)
    elif ';' in variant_id:
        # the hashed local id does not resolve anywhere: a deliberate 404
        curie = ':haplotype_' + Source.hash_id(variant_id)
        variant_type = "haplotype"
    elif variant_id.startswith('rs'):
        # drop the alteration (everything from the first hyphen on) and
        # remove spaces from errant ids such as 'rs6194 5053-?'
        curie = 'dbSNP:' + variant_id.split('-')[0].replace(' ', '')
        variant_type = "snp"
    elif variant_id.startswith('kgp'):
        # http://www.1000genomes.org/faq/what-are-kgp-identifiers
        curie = 'GWAS:' + variant_id.split('-')[0]
        variant_type = "snp"
    elif variant_id.startswith('chr'):
        # like: chr10:106180121-G
        # A '-?' allele (presumably "unknown") is recorded as '-N'.
        # This has to be a *literal* replacement: as a regex, r'-?' means
        # "an optional hyphen", which also matches the empty string, so
        # re.sub() with it inserted '-N' between every character and gave
        # ids like  gwas--Nc-Nh-Nr-N1-N1-N--N1-N0-...
        variant_id = variant_id.replace('-?', '-N').replace(' ', '')
        # minted through Source.make_id with the '_' prefix rather than as a
        # base-prefixed curie (a blank node, per the original author's note)
        curie = Source.make_id('gwas-' + variant_id.replace(':', '-'), '_')
        variant_type = "snp"
    elif variant_id:
        # blank ids are skipped silently; anything else unrecognised is logged
        LOG.warning("There's a snp id i can't manage: %s", variant_id)

    return curie, variant_type
def _get_curie_and_type_from_id(variant_id):
    """
    Make a best guess at the curie and type (snp, haplotype, ...) of a variant id.

    ``None`` is returned for both curie and type when the id is blank or
    cannot be processed.

    :param variant_id: variant identifier string, optionally followed by
        '-<allele>' (e.g. 'rs123-A', 'chr10:106180121-G')
    :return: tuple ``(curie, variant_type)``; both are set or both are ``None``
    """
    curie = None
    variant_type = None

    # Remove the space before hyphens, then any surrounding whitespace.
    # Stripping up front (instead of inside each branch) lets ids with
    # leading whitespace still be recognised by the prefix tests below.
    variant_id = variant_id.replace(' -', '-').strip()

    if ' x ' in variant_id or ',' in variant_id:
        # TODO deal with rs1234 x rs234... (haplotypes?)
        logger.warning(
            "Cannot parse variant groups of this format: %s", variant_id)
    elif ';' in variant_id:
        curie = ':haplotype_' + Source.hash_id(variant_id)
        variant_type = "haplotype"
    elif variant_id.startswith('rs'):
        # remove the alteration (everything from the first hyphen on) and
        # any embedded spaces, so the curie never contains whitespace
        curie = 'dbSNP:' + variant_id.split('-')[0].replace(' ', '')
        variant_type = "snp"
    elif variant_id.startswith('kgp'):
        # http://www.1000genomes.org/faq/what-are-kgp-identifiers
        # the whole id, including any alteration, is kept
        curie = ':kgp-' + variant_id
        variant_type = "snp"
    elif variant_id.startswith('chr'):
        # like: chr10:106180121-G
        # spaces are dropped and ':' becomes '-'; the allele is kept verbatim
        curie = ':gwas-' + variant_id.replace(' ', '').replace(':', '-')
        variant_type = "snp"
    elif variant_id:
        # blank ids are skipped silently; anything else unrecognised is logged
        logger.warning(
            "There's a snp id i can't manage: %s", variant_id)

    return curie, variant_type