Esempio n. 1
0
    def create_file(self):
        '''
        Write a person-by-person table of snp differences to self.filename.

        Every entry listed in self.inputDirectory is loaded as a VcfFile. The
        output CSV has one row and one column per person id; each cell holds
        the value returned by self.count_diffs for that pair of people, or 0
        where the row and column person ids are equal.

        Progress is printed to stdout: each source file name, and indented
        beneath it each file it is compared against.

        Future work - change the number so that it is a measure of similarity
        instead of a measure of difference.
        '''

        # Resolve each path and person id exactly once, always through the
        # input directory, so the header columns and the row keys are derived
        # from the same path. os.path.join yields the same path as plain
        # concatenation when inputDirectory already ends with a separator.
        people = []
        for srcFileName in os.listdir(self.inputDirectory):
            srcPath = os.path.join(self.inputDirectory, srcFileName)
            personId = vcffile.VcfFile(srcPath).get_person_id()
            people.append((srcFileName, srcPath, personId))

        headerFields = [FIELD_PERSONID] + [person[2] for person in people]

        with open(self.filename, 'w') as destFile:
            writer = csv.DictWriter(destFile,
                                    fieldnames=headerFields,
                                    lineterminator='\n')
            writer.writeheader()
            countOfSrcFiles = 0
            for srcFileName, srcPath, personId in people:
                print(srcFileName)
                snpsAndAlleles = sorted(
                    vcffile.VcfFile(srcPath).get_all_snps_and_alleles())
                rowOut = {FIELD_PERSONID: personId}
                for compareFileName, comparePath, comparePerson in people:
                    if personId == comparePerson:
                        countDiffs = 0
                    else:
                        print('   ' + compareFileName)
                        # The comparison data is re-read for each pair instead
                        # of being cached, so only two people's snps are held
                        # in memory at any time.
                        compareSnpsAndAlleles = sorted(
                            vcffile.VcfFile(
                                comparePath).get_all_snps_and_alleles())
                        countDiffs = self.count_diffs(snpsAndAlleles,
                                                      compareSnpsAndAlleles)
                    rowOut[comparePerson] = countDiffs
                writer.writerow(rowOut)
                countOfSrcFiles += 1
        print("Wrote " + str(countOfSrcFiles) + " to " + self.filename)
Esempio n. 2
0
    def write_one_person_to_file(self, srcFileName, writer):
        '''
        Write one row per non-'0' risk allele found in srcFileName.

        The risk snps are loaded with read_from_file first when none are
        present. Each row written through writer carries the allele's position
        in the risk snp list, the person id, the snp id, the allele and the
        odds ratio. self._recordCount is incremented once per row written, and
        a summary line is printed for the file.
        '''

        if (self.riskSnps.len() == 0):
            self.riskSnps.read_from_file()

        vcf = vcffile.VcfFile(srcFileName)
        person = vcf.get_person_id()
        alleles = vcf.get_these_risksnps(self.riskSnps)

        # A single dict is reused for every row; only the per-snp fields change.
        row = dict.fromkeys(
            (FIELD_INDEX, FIELD_SNPID, FIELD_ALLELE, FIELD_ODDSRATIO), 0)
        row[FIELD_PERSONID] = person

        written = 0
        for position, allele in enumerate(alleles):
            if allele == '0':
                continue
            row.update({
                FIELD_INDEX: position,
                FIELD_SNPID: self.riskSnps.snps[position],
                FIELD_ALLELE: allele,
                FIELD_ODDSRATIO: self.riskSnps.oddsratio[position],
            })
            writer.writerow(row)
            self._recordCount += 1
            written += 1
        print(srcFileName + '  ' + str(written) + ' risk snps')
Esempio n. 3
0
 def test_get_an_allele_number(self):
     '''
     get_an_allele_number should map a character allele to a number string:
     'A' against 'G' gives '1', and 'G' against 'G' gives '4'.
     '''
     vcf = vcffile.VcfFile()
     for allele, expected in (('A', '1'), ('G', '4')):
         self.assertEqual(expected, vcf.get_an_allele_number(allele, 'G'))
Esempio n. 4
0
 def test_get_an_allele(self):
     '''VcfFile.get_an_allele should give 'A' for the sample file's first snp line'''
     self.assertTrue(os.path.exists(SAMPLEFILENAME))
     vcf = vcffile.VcfFile(SAMPLEFILENAME)
     allele = vcf.get_an_allele(vcf.get_first_snp_line())
     self.assertEqual('A', allele)
Esempio n. 5
0
 def test_get_first_snp_line(self):
     '''The snp id on the sample file's first snp line should be rs12028261'''
     self.assertTrue(os.path.exists(SAMPLEFILENAME))
     vcf = vcffile.VcfFile(SAMPLEFILENAME)
     snpId = vcf.get_a_snp_id(vcf.get_first_snp_line())
     self.assertEqual('rs12028261', snpId)
Esempio n. 6
0
 def test_vcffile_get_these_risksnps(self):
     '''
     VcfFile.get_these_risksnps should return one allele per requested snp;
     for the sample file the first two are expected to be '4'.
     '''
     wanted = risksnps.RiskSnps()
     wanted.set_snps([
         'rs102275', 'rs3764147', 'rs7927997', 'rs415890', 'rs4077515',
         'rs3810936', 'rs2476601', 'rs3792109'
     ])
     wanted.set_alleles(['C', 'G', 'T', 'C', 'T', 'C', 'G', 'A'])

     found = vcffile.VcfFile(SAMPLEFILENAME).get_these_risksnps(wanted)

     self.assertEqual(wanted.len(), len(found))
     for position in (0, 1):
         self.assertEqual('4', found[position])
Esempio n. 7
0
 def get_one_person_from_file(self, srcFileName):
     '''
     Build one table row for the person stored in srcFileName.

     Returns a dict holding the person id under FIELD_PERSONID plus one entry
     per allele returned by get_these_risksnps, keyed by the snp at the same
     position in self.riskSnps.snps. The file name is printed first.
     '''
     print(srcFileName)
     vcf = vcffile.VcfFile(srcFileName)
     row = {FIELD_PERSONID: vcf.get_person_id()}
     alleles = vcf.get_these_risksnps(self.riskSnps)
     for position, allele in enumerate(alleles):
         row[self.riskSnps.snps[position]] = allele
     return row
Esempio n. 8
0
 def test_get_these_risksnps(self):
     '''
     VcfFile.get_these_risksnps should return a list of allele numbers,
     one per requested risk snp.

     The values are usually 4s: 4 stands for the risk allele, and in this
     dataset a person whose allele for a risk snp differs from the reference
     genome usually, though not always, has the risk allele.
     '''
     snpIds = [
         'rs102275', 'rs3764147', 'rs7927997', 'rs415890', 'rs4077515',
         'rs3810936', 'rs2476601', 'rs3792109'
     ]
     alleles = ['C', 'G', 'T', 'C', 'T', 'C', 'G', 'A']

     requested = risksnps.RiskSnps()
     requested.set_snps(snpIds)
     requested.set_alleles(alleles)

     sample = vcffile.VcfFile(SAMPLEFILENAME)
     numbers = sample.get_these_risksnps(requested)

     self.assertEqual(requested.len(), len(numbers))
     first, second = numbers[0], numbers[1]
     self.assertEqual('4', first)
     self.assertEqual('4', second)
Esempio n. 9
0
    def write_one_person_to_file(self, srcFileName, writer):
        '''
        Write one row per snp from srcFileName whose allele is not '0'.

        Each row written through writer holds the person id, the snp id and
        the allele. Afterwards a summary line is printed giving the number of
        rows actually written for this file.
        '''

        srcData = vcffile.VcfFile(srcFileName)
        personId = srcData.get_person_id()
        snpsAndAlleles = srcData.get_all_snps_and_alleles()
        recordCount = 0
        for snpAndAllele in snpsAndAlleles:
            snpId = snpAndAllele[0]
            allele = snpAndAllele[1]
            if allele == '0':
                continue
            writer.writerow({
                FIELD_PERSONID: personId,
                FIELD_SNPID: snpId,
                FIELD_ALLELE: allele,
            })
            # Count only rows that were written, so the summary below matches
            # the output rather than the number of snps examined.
            recordCount += 1
        print(srcFileName + ' wrote ' + str(recordCount) + ' records to ' +
              self.filename)
Esempio n. 10
0
 def test_person_id(self):
     '''VcfFile.get_person_id should give A0024 for the sample file'''
     self.assertTrue(os.path.exists(SAMPLEFILENAME))
     vcf = vcffile.VcfFile(SAMPLEFILENAME)
     self.assertEqual("A0024", vcf.get_person_id())