def _stream2SSfile(self, RefSNPdict):
    """
    Makes a simple SNP file for analysis by the SNPhylo tree maker.

    For every chromosome in self.chrList, the reference positions are walked
    in ascending order; a row is written for each position that is also
    present in self.genome[chromosome]. The sample column holds ONE allele:
      - genotype longer than one character: the second letter when the first
        letter equals the reference (case-insensitive), otherwise the first
        letter, i.e. the allele that differs from the reference when there
        is one
      - genotype of one character: that character

    ** Notes carried over from the original author:
      - SNPhylo is case-sensitive
      - It requires a single genotype letter per sample

    ### BASIC FORMAT (tab separated):
    #Chrom  Pos     Ref     SampleID1
    1       1000    A       T

    RefSNPdict: reference SNPs, indexed as RefSNPdict[chromosome][position]
        with the reference allele stored under the GENOTYPE key.

    RETURNS: none, streams to self.outputPath.
    """
    # 'with' closes the file even if a lookup below raises.
    with open(self.outputPath, 'w') as F:
        F.write('#Chrom\tPos\tRef\tSampleID1\n')
        for chrom in self.chrList:
            for pos in sorted(RefSNPdict[chrom].keys()):
                refGenotype = RefSNPdict[chrom][pos][GENOTYPE].upper()
                if pos not in self.genome[chrom]:
                    continue
                # The original indexed the sample with an undefined name
                # ('refPos'); the position being iterated is what is meant.
                genotype = self.genome[chrom][pos][GENOTYPE]
                if len(genotype) > 1:
                    if refGenotype.lower() == genotype[0].lower():
                        sample = genotype[1]
                    else:
                        sample = genotype[0]
                else:
                    sample = genotype
                # NOTE(review): the reference is upper-cased but the sample
                # letter is written as stored; since SNPhylo is said to be
                # case-sensitive, confirm the sample data is already upper case.
                F.write('%s\t%s\t%s\t%s\n' % (chrom, pos, refGenotype, sample))
    parseNstream.printMsg("Completed streaming to Simple SNP file format (path=%s)" % self.outputPath)
def _stream2SRSIDfile(self):
    """
    Streams the RSID of every SNP in self.genome to self.outputPath, one
    RSID per line.

    self.genome is walked as self.genome[chromosome][position]; rows are
    written in the iteration order of those containers (no sorting is done).

    RETURNS: none, streams to file.
    """
    # 'with' guarantees the handle is closed even if a record lacks RSID.
    with open(self.outputPath, 'w') as F:
        for chrom in self.genome:
            for pos in self.genome[chrom]:
                F.write('%s\n' % self.genome[chrom][pos][RSID])
    parseNstream.printMsg("Completed streaming to simple RSID file format (path=%s)" % self.outputPath)
def __init__(self, chrom=""): if chrom == "": printMsg("Enter chromosome (1, 2, ..., or 23 (X), 24 (Y), 25 (MT))") chrom = raw_input('---> ') self.num = int(chrom) self.chrLengths = [248956422, 242193529, 198295559, 190214555, 181538259, \ 170805979, 159345973, 145138636, 138394717, 133797422, 135086622, \ 133275309, 114364328, 107043718, 101991189, 90338345, 83257441, \ 80373285, 58617616, 64444167, 46709983, 50818468, 156040895, 57227415, 16569] self.positions = [] self.minSNP = 0 self.maxSNP = 0
def addUserScoredData_toDB(file, userObjID='', streamStatus=''):
    """
    Adds every SNP of the scored-alleles object built from `file` to the
    per-chromosome DB collections.

    file: path to a file output from scoreAlleles().
    userObjID: a user id from the users_collection.users, e.g.
        "_id": { "$oid": "555a83687a349b6910bdff6c" }
        Prompted for on stdin when empty.
    streamStatus: pass 'empty' to insert every SNP without first querying
        the collection for a document with the same _id; any other value
        inserts only SNPs whose _id is not found.

    Chromosomes are routed to four collections on db_model (1-5, 6-11,
    12-17, 18-25). Progress is printed each time the number of newly
    inserted SNPs reaches the next value in db_model.CHECKLIST, and when the
    last chromosome of each group has been processed.

    RETURNS: none; a summary is printed. If `file` does not exist only a
    "not found" message is printed.
    """
    if userObjID == '':
        userObjID = raw_input('Please enter a user ID: ')
    assert(userObjID != '')

    # Date/time stamp for start of run
    parseNstream.printMsg(datetime.datetime.now().ctime())

    if not os.path.exists(file):
        print("Path <%s> not found" % file)
        return

    # TODO (carried over): assert the file type and column headers here.
    ScoredSNPdict = parseNstream._scoredAllelesObject(file, userObjID)

    # (chromosomes in the group, collection that stores them)
    intervals = (
        (('1', '2', '3', '4', '5'), db_model.CollectbyChromosome_1to5),
        (('6', '7', '8', '9', '10', '11'), db_model.CollectbyChromosome_6to11),
        (('12', '13', '14', '15', '16', '17'), db_model.CollectbyChromosome_12to17),
        (('18', '19', '20', '21', '22', '23', '24', '25'), db_model.CollectbyChromosome_18to25),
    )
    intervalOf = {}
    for number, (members, groupCollection) in enumerate(intervals, 1):
        for member in members:
            intervalOf[member] = (number, members, groupCollection)

    newNumSNPs = 0
    checkIndex = 0
    sampleShown = False
    alreadyinDB = {'total': 0, 'ids': []}

    for chrom in parseNstream.CHROMOSOME_LIST:
        alreadyinDB[chrom] = 0
        if chrom not in intervalOf:
            # Previously such a chromosome was written into whichever
            # collection the preceding chromosome had selected.
            parseNstream.printMsg("Chromosome %s has no DB collection; skipped" % chrom)
            continue
        number, members, collection = intervalOf[chrom]

        # Add new RSIDs to the DB in ascending positional order.
        for position in sorted(ScoredSNPdict[chrom].keys()):
            data = ScoredSNPdict[chrom][position]
            if not sampleShown:
                parseNstream.printMsg("Sample data object:")
                print(data)
                sampleShown = True

            # Insert into DB; the existence query is skipped for 'empty'.
            if streamStatus == 'empty' or not collection.find({'_id': data[RSID]}).count():
                collection.insert(data)
                newNumSNPs += 1
            else:
                alreadyinDB['total'] += 1
                alreadyinDB[chrom] += 1
                alreadyinDB['ids'].append(data[RSID])

            # Guard the index: once every checkpoint has been reported there
            # is no further CHECKLIST entry to compare against.
            if checkIndex < len(db_model.CHECKLIST) and newNumSNPs == db_model.CHECKLIST[checkIndex]:
                parsedSoFar = newNumSNPs + alreadyinDB['total']
                print("[x] Check %s-th complete. %s SNPs parsed so far. %s new SNPs added in total. %s parsed SNPs were already in the DB." % (checkIndex + 1, parsedSoFar, newNumSNPs, alreadyinDB['total']))
                checkIndex += 1

        if chrom == members[-1]:
            parseNstream.printMsg("Interval %s complete: (%s-%s)" % (number, members[0], members[-1]))

    print("[x] Last check complete. %s total SNPs added to the UsersSNPCollection." % newNumSNPs)

    # Date/time stamp for end of run
    parseNstream.printMsg(datetime.datetime.now().ctime())
    print(alreadyinDB['ids'])
    parseNstream.printMsg('SNPS ALREADY IN DB: %s' % alreadyinDB['total'])
    for chrom in parseNstream.CHROMOSOME_LIST:
        print("Chr%s: %s" % (chrom, alreadyinDB[chrom]))
... etc. """ # Make dict of all SNP positions. Keys = Chromosome #, Values = lst of positions for chrom in ChromosomeDict: ChromosomePositions[chrom] = ChromosomeDict[chrom].keys() ChromosomePositions[chrom].sort() # Could go without Chromosomes[chrom] = Chromosome(chrom=chrom) Chromosomes[chrom].positions = ChromosomePositions[chrom] ChromosomePositions[chrom] = np.array(ChromosomePositions[chrom],dtype=np.float64) # Output min and max values for chrom in ChromosomePositions: Chromosomes[chrom].minSNP = ChromosomePositions[chrom].min() Chromosomes[chrom].maxSNP = ChromosomePositions[chrom].max() printMsg("Chr%s: lowest chromosome position, highest chromosome position = (%s, %s)" % (chrom, Chromosomes[chrom].minSNP, Chromosomes[chrom].maxSNP)) ChromosomeSizes = [] for chrom in Chromosomes: ChromosomeSizes.append(Chromosomes[chrom].size()) print Chromosomes[chrom].size() ChromosomeSizes.sort() ChromosomeSizes.reverse() ChromosomeSizes = np.array(ChromosomeSizes) largest_size = ChromosomeSizes.max() smallest_size = ChromosomeSizes.min() largest_chrom = "" smallest_chrom = "" printMsg("Chromosomes By Size") index = 1
def scoreGenotype(sampleAllele, refAllele):
    """
    Scores one user genotype against the single-letter reference allele.
    Letters are compared case-insensitively.

    sampleAllele: genotype string from the user's file; an allele pair
        (len 2) or a single allele (len 1).
    refAllele: the reference allele, ONE letter.

    RETURNS: (variant, score)
      - ('-', '-')         no base call ('--' or '-'), or a genotype whose
                           length is neither 1 nor 2
      - ('-', 2)           pair, both letters match the reference
      - (other letter, 1)  pair, exactly one letter matches the reference
      - (sampleAllele, 0)  no letter matches the reference
      - ('-', 1)           single letter that matches the reference
    """
    ref = refAllele.lower()

    if len(sampleAllele) == 2:
        first, second = sampleAllele[0], sampleAllele[1]
        if first == '-' and second == '-':
            return '-', '-'
        firstMatches = first.lower() == ref
        secondMatches = second.lower() == ref
        if firstMatches and secondMatches:
            return '-', 2
        if firstMatches:
            return second, 1
        if secondMatches:
            return first, 1
        return sampleAllele, 0

    if len(sampleAllele) == 1:
        if sampleAllele == '-':
            return '-', '-'
        if sampleAllele.lower() == ref:
            return '-', 1
        return sampleAllele, 0

    # Unexpected length: report it as unscored instead of letting the caller
    # reuse the variant/score left over from the previous SNP.
    return '-', '-'


def concatenateSNPs(RefSNPFile, UserSNPFile):
    """
    Concatenates the user's alleles at the reference SNP positions into one
    letter sequence and streams it to DEFAULT_OUTPUT_FILEPATH.

    Structure of RefSNPdict and UserSNPdict:
        key = chromosome string as '1', '2', ..., '23' (X), '24' (Y), '25' (MT/M)

    Chromosomes are taken in parseNstream.CHROMOSOME_LIST order and positions
    in ascending order. For each reference position also present in the
    user's data:
      - INDEL genotypes (per isIndel) are skipped
      - genotypes without a base call (score '-') are skipped
      - otherwise the variant letter(s) are written, or the first genotype
        letter when there is no variant
    No header line, separators or newlines are written.

    NOTE(review): a genotype with no match to the reference (score 0)
    contributes both of its letters while every other SNP contributes one;
    confirm that is intended for the downstream alignment.

    RETURNS: none, streams to file...
    """
    RefSNPdict = parseNstream._referenceObject(RefSNPFile, 'CHROMOSOME')
    UserSNPdict = parseNstream._23andmeObject(UserSNPFile, 'CHROMOSOME')

    # (ref rsid, user rsid) pairs that share a position but differ in rsid
    nonMatchedRSIDs = []
    numMatchedRSIDs = 0
    snpsConcatenated = 0

    with open(DEFAULT_OUTPUT_FILEPATH, 'w') as F:
        for chrom in parseNstream.CHROMOSOME_LIST:
            for pos in sorted(RefSNPdict[chrom].keys()):
                refAllele = RefSNPdict[chrom][pos][GENOTYPE].upper()
                rsid = RefSNPdict[chrom][pos][RSID]

                # Only positions the user also has are considered
                if pos not in UserSNPdict[chrom]:
                    continue
                userSNP = UserSNPdict[chrom][pos]
                if rsid != userSNP[RSID]:
                    nonMatchedRSIDs.append((rsid, userSNP[RSID]))
                if isIndel(userSNP[GENOTYPE]):
                    continue

                numMatchedRSIDs += 1
                sampleAllele = userSNP[GENOTYPE]
                variant, score = scoreGenotype(sampleAllele, refAllele)
                if score == '-':
                    continue
                snpsConcatenated += 1
                if variant == '-':
                    F.write('%s' % sampleAllele[0])
                else:
                    F.write('%s' % variant)

    parseNstream.printMsg("Completed scoring user SNPs to REF SNPs. Streamed to a .FASTA file (path=%s)" % DEFAULT_OUTPUT_FILEPATH)
    parseNstream.printMsg('Number of matched rsid values: %s' % numMatchedRSIDs)
    print("Only the RSID's of the user that existed in the reference data base were used. Also, INDELs were not concatenated.")
    parseNstream.printMsg('Number of unmatched rsid values: %s' % len(nonMatchedRSIDs))
    parseNstream.printMsg('Number of SNPs concatenated: %s' % snpsConcatenated)


def scoreAlleles(RefSNPFile, UserSNPFile):
    """
    Scores the user's SNPs against the reference SNPs and streams one row per
    scored SNP to DEFAULT_OUTPUT_FILEPATH.

    Structure of RefSNPdict and UserSNPdict:
        key = chromosome string as '1', '2', ..., '23' (X), '24' (Y), '25' (MT/M)

    (*) Reference allele is only ONE letter.
      - Genotype is the original allele representation from the 23andme file
        (len 2 for an allele pair, len 1 for a single allele)
      - Variant(s) represents any letters different from the Reference, if
        none then given '-'
      - Match Score: see scoreGenotype() in this module for the exact rules
          2 if both letters are the same as the Reference
          1 if one letter is the same as the Reference
          0 if neither matches the Reference
          '-' if there is no base call
    (*) Variants given a match score of 0 may need to be switched to the
        opposite letters (A to T, C to G and vice versa); must check to
        confirm which are minus vs. plus strands.

    ### BASIC FORMAT (tab separated):
    #Chrom  RSID    Pos     Ref     Genotype        Variant(s)      Match Score
    1       rs...   1000    A       AA              -               2

    Only reference positions that also exist in the user's data are scored.
    INDEL genotypes (per isIndel) are not written; they are collected instead.

    RETURNS: indel, a dict indel[chromosome][position] = {RSID: ..., GENOTYPE: ...}
    for every INDEL found in the user's data at a reference position.
    """
    RefSNPdict = parseNstream._referenceObject(RefSNPFile, 'CHROMOSOME')
    UserSNPdict = parseNstream._23andmeObject(UserSNPFile, 'CHROMOSOME')

    # RSIDs that share a position but not an rsid value, as (ref rsid, user rsid)
    nonMatchedRSIDs = []
    # Number of non-INDEL SNPs whose position exists in both ref and user
    numMatchedRSIDs = 0
    # Keep track of indels
    indel = {}

    with open(DEFAULT_OUTPUT_FILEPATH, 'w') as F:
        F.write('#Chrom\tRSID\tPos\tRef\tGenotype\tVariant(s)\tMatch Score\n')
        for chrom in parseNstream.CHROMOSOME_LIST:
            indel[chrom] = {}
            for pos in sorted(RefSNPdict[chrom].keys()):
                refAllele = RefSNPdict[chrom][pos][GENOTYPE].upper()
                rsid = RefSNPdict[chrom][pos][RSID]

                # Only positions the user also has are considered
                if pos not in UserSNPdict[chrom]:
                    continue
                userSNP = UserSNPdict[chrom][pos]
                if rsid != userSNP[RSID]:
                    nonMatchedRSIDs.append((rsid, userSNP[RSID]))
                if isIndel(userSNP[GENOTYPE]):
                    indel[chrom][pos] = {
                        RSID: userSNP[RSID],
                        GENOTYPE: userSNP[GENOTYPE],
                    }
                    continue

                numMatchedRSIDs += 1
                sampleAllele = userSNP[GENOTYPE]
                variant, score = scoreGenotype(sampleAllele, refAllele)
                F.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (chrom, rsid, pos, refAllele, sampleAllele, variant, score))

    parseNstream.printMsg("Completed scoring user SNPs to REF SNPs. Streamed to a .teyden (LOL) file format (path=%s)" % DEFAULT_OUTPUT_FILEPATH)
    parseNstream.printMsg('Number of matched rsid values: %s' % numMatchedRSIDs)
    parseNstream.printMsg('Number of unmatched rsid values: %s' % len(nonMatchedRSIDs))
    return indel