def convertTo(self, format=''):
    """
    Convert the original file into the file format named by `format`.

    `format` is compared case-insensitively against 'VCF', 'BED', 'SS' and
    'RSID'. self.outputPath is always set to
    'output/conversionTo_' + format + <endpoint from self._getEndpoint>;
    when `format` is none of the names above, no stream method is called.

    Returns none
    """
    endpoint = self._getEndpoint(format)
    self.outputPath = 'output/conversionTo_' + format + endpoint

    requested = format.upper()

    if requested == 'SS':
        # TODO: input and output of files should also go via the DB.
        # TODO: parseNstream functions are to be added as methods to a
        # Genome object.
        referenceSnps = parseNstream._referenceObject(self.referencePath, 'CHROMOSOME')
        self._stream2SSfile(referenceSnps)
        return

    # Writers that take no arguments, looked up by upper-cased format name.
    writerNames = {
        'VCF': '_stream2VCFfile',
        'BED': '_stream2BEDfile',
        'RSID': '_stream2SRSIDfile',
    }
    writerName = writerNames.get(requested)
    if writerName is not None:
        getattr(self, writerName)()
def concatenateSNPs(RefSNPFile, UserSNPFile):
    """
    Concatenate the user's alleles at the reference SNP positions into one
    continuous string written to DEFAULT_OUTPUT_FILEPATH.

    Both files are parsed into dicts indexed by chromosome and then by
    position. Chromosomes are visited in parseNstream.CHROMOSOME_LIST order
    and reference positions in sorted order. Only positions present in both
    the reference and the user's data are considered; genotypes for which
    isIndel() is true are skipped.

    Each remaining genotype is compared (case-insensitively) with the
    reference allele:
        - Variant(s): the genotype letter(s) that differ from the reference,
          '-' if there are none
        - Match Score:
            - 2 if both letters of a two-letter genotype match the reference
            - 1 if exactly one letter of a two-letter genotype matches, or if
              a one-letter genotype matches
            - 0 if nothing matches the reference
            - '-' for a no-call ('--' or '-') or a genotype that is neither
              one nor two characters long

    For every genotype with a numeric score, the variant letter(s) are
    appended to the output; when there is no variant the first genotype
    letter is appended instead. Genotypes scored '-' write nothing.

    (*) Variants given a match score of 0 may need to be switched to the
    opposite letters (A to T, C to G and vice versa); must check to confirm
    which are minus vs. plus strands.

    RETURNS: none, streams to file...
    """
    RefSNPdict = parseNstream._referenceObject(RefSNPFile, 'CHROMOSOME')
    UserSNPdict = parseNstream._23andmeObject(UserSNPFile, 'CHROMOSOME')

    # (ref rsid, user rsid) tuples for shared positions whose rsids differ.
    nonMatchedRSIDs = []
    # Shared, non-indel positions that were scored.
    numMatchedRSIDs = 0
    snpsConcatenated = 0

    # `with` guarantees the output file is closed even if a lookup below raises.
    with open(DEFAULT_OUTPUT_FILEPATH, 'w') as F:
        for chrom in parseNstream.CHROMOSOME_LIST:
            # sorted() works on both a Python 2 key list and a Python 3 key view.
            for pos in sorted(RefSNPdict[chrom].keys()):
                refRecord = RefSNPdict[chrom][pos]
                refAllele = refRecord[GENOTYPE].upper()
                rsid = refRecord[RSID]

                # Only reference positions the user also has are considered.
                if pos not in UserSNPdict[chrom]:
                    continue
                userRecord = UserSNPdict[chrom][pos]

                if rsid != userRecord[RSID]:
                    nonMatchedRSIDs.append((rsid, userRecord[RSID]))

                sampleAllele = userRecord[GENOTYPE]
                if isIndel(sampleAllele):
                    # INDELs are not concatenated.
                    continue
                numMatchedRSIDs += 1

                # Start from "no call" so a genotype of unexpected length can
                # never reuse the variant/score of the previous SNP.
                variant = '-'
                score = '-'
                ref = refAllele.lower()

                # Chromosomes 1-22 will have an allele pair, hence str len of 2
                if len(sampleAllele) == 2 and sampleAllele != '--':
                    firstMatches = sampleAllele[0].lower() == ref
                    secondMatches = sampleAllele[1].lower() == ref
                    if firstMatches and secondMatches:
                        score = 2
                    elif firstMatches:
                        variant = sampleAllele[1]
                        score = 1
                    elif secondMatches:
                        variant = sampleAllele[0]
                        score = 1
                    else:
                        variant = sampleAllele
                        score = 0
                # Mitochondria, X, and Y chromosome alleles are of length 1
                elif len(sampleAllele) == 1 and sampleAllele != '-':
                    if sampleAllele.lower() == ref:
                        score = 1
                    else:
                        variant = sampleAllele
                        score = 0

                if score == '-':
                    # No base call (or unscorable genotype): nothing to write.
                    continue

                snpsConcatenated += 1
                # With no variant, fall back to the first genotype letter.
                F.write(sampleAllele[0] if variant == '-' else variant)

    parseNstream.printMsg("Completed scoring user SNPs to REF SNPs. Streamed to a .FASTA file (path=%s)" % DEFAULT_OUTPUT_FILEPATH)
    parseNstream.printMsg('Number of matched rsid values: %s' % numMatchedRSIDs)
    # Parenthesised single-argument form prints identically on Python 2 and 3.
    print("Only the RSID's of the user that existed in the reference data base were used. Also, INDELs were not concatenated.")
    parseNstream.printMsg('Number of unmatched rsid values: %s' % len(nonMatchedRSIDs))
    parseNstream.printMsg('Number of SNPs concatenated: %s' % snpsConcatenated)
def scoreAlleles(RefSNPFile, UserSNPFile):
    """
    Score the user's genotypes against the reference alleles and stream one
    tab-separated row per scored SNP to DEFAULT_OUTPUT_FILEPATH.

    Both files are parsed into dicts indexed by chromosome and then by
    position. Chromosomes are visited in parseNstream.CHROMOSOME_LIST order
    and reference positions in sorted order. Only positions present in both
    the reference and the user's data are considered.

    (*) Reference allele is only ONE letter.
        - Genotype is the original allele representation from the user's file
        - Variant(s) represents any letters different from the Reference,
          if none then given '-'
        - Match Score: the SNP is given a score of ...
            - 2 if both letters of a two-letter genotype match the Reference
            - 1 if exactly one letter of a two-letter genotype matches, or if
              a one-letter genotype matches
            - 0 if nothing matches the Reference
            - '-' for a no-call ('--' or '-') or a genotype that is neither
              one nor two characters long

    (*) Variants given a match score of 0 may need to be switched to the
    opposite letters (A to T, C to G and vice versa); must check to confirm
    which are minus vs. plus strands.

    ### OUTPUT FORMAT:
    #Chrom  RSID  Pos  Ref  Genotype  Variant(s)  Match Score

    Genotypes for which isIndel() is true are not written to the file; they
    are collected and returned instead.

    RETURNS: dict of chromosome -> {position: {RSID: ..., GENOTYPE: ...}}
    holding the user's indel records.
    """
    RefSNPdict = parseNstream._referenceObject(RefSNPFile, 'CHROMOSOME')
    UserSNPdict = parseNstream._23andmeObject(UserSNPFile, 'CHROMOSOME')

    # (ref rsid, user rsid) tuples for shared positions whose rsids differ.
    nonMatchedRSIDs = []
    # Shared, non-indel positions that were scored; reported at the end.
    numMatchedRSIDs = 0
    # Keep track of indels
    indel = {}

    # `with` guarantees the output file is closed even if a lookup below raises.
    with open(DEFAULT_OUTPUT_FILEPATH, 'w') as F:
        F.write('#Chrom\tRSID\tPos\tRef\tGenotype\tVariant(s)\tMatch Score\n')

        for chrom in parseNstream.CHROMOSOME_LIST:
            indel[chrom] = {}
            # sorted() works on both a Python 2 key list and a Python 3 key view.
            for pos in sorted(RefSNPdict[chrom].keys()):
                refRecord = RefSNPdict[chrom][pos]
                refAllele = refRecord[GENOTYPE].upper()
                rsid = refRecord[RSID]

                # Only reference positions the user also has are considered.
                if pos not in UserSNPdict[chrom]:
                    continue
                userRecord = UserSNPdict[chrom][pos]

                if rsid != userRecord[RSID]:
                    nonMatchedRSIDs.append((rsid, userRecord[RSID]))

                sampleAllele = userRecord[GENOTYPE]
                if isIndel(sampleAllele):
                    indel[chrom][pos] = {
                        RSID: userRecord[RSID],
                        GENOTYPE: sampleAllele,
                    }
                    continue
                numMatchedRSIDs += 1

                # Start from "no call" so a genotype of unexpected length can
                # never reuse the variant/score of the previous SNP.
                variant = '-'
                score = '-'
                ref = refAllele.lower()

                # Chromosomes 1-22 will have an allele pair, hence str len of 2
                if len(sampleAllele) == 2 and sampleAllele != '--':
                    firstMatches = sampleAllele[0].lower() == ref
                    secondMatches = sampleAllele[1].lower() == ref
                    if firstMatches and secondMatches:
                        # Score 2, no variants. [GRAY - default]
                        score = 2
                    elif firstMatches:
                        # Score 1, the second letter is the variant. [GREEN - success]
                        variant = sampleAllele[1]
                        score = 1
                    elif secondMatches:
                        # Score 1, the first letter is the variant. [GREEN - success]
                        variant = sampleAllele[0]
                        score = 1
                    else:
                        # Score 0, two variants. [ORANGE - primary]
                        variant = sampleAllele
                        score = 0
                # Mitochondria, X, and Y chromosome alleles are of length 1
                elif len(sampleAllele) == 1 and sampleAllele != '-':
                    if sampleAllele.lower() == ref:
                        # Score 1, no variants. [GRAY - default]
                        score = 1
                    else:
                        variant = sampleAllele
                        score = 0

                F.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (chrom, rsid, pos, refAllele, sampleAllele, variant, score))

    parseNstream.printMsg("Completed scoring user SNPs to REF SNPs. Streamed to a .teyden (LOL) file format (path=%s)" % DEFAULT_OUTPUT_FILEPATH)
    parseNstream.printMsg('Number of matched rsid values: %s' % numMatchedRSIDs)
    parseNstream.printMsg('Number of unmatched rsid values: %s' % len(nonMatchedRSIDs))
    return indel