# Per-line processing (the enclosing loop header is outside this excerpt):
# parse one whitespace-separated row, drop it if it has too many missing
# calls, and write polymorphic sites to a per-chromosome output file.
words = line.split()
    # chromosome label: the second '_'-separated piece of the first column
    # NOTE(review): `chr` shadows the builtin chr() in this scope
    chr = str(words[0].split('_')[1])
    pos = words[1]  # kept as a string; in this excerpt it is only written out

    # split chromosomes into separate files:
    # when the chromosome label changes, close the current output and open
    # a new one named "<chr>_<args.output>"; the site counter restarts at 1
    if chr != chrPrev:
      fileoutput.close()
      fileoutput = open(chr+'_'+args.output, 'w')
      chrPrev = chr
      siteNumber = 1

    # select samples
    genotypes = calls.selectSamples(sampCol, words)

    # count Ns
    valueN = calls.countPerPosition(genotypes, 'N')

    # keep the row only if the N count does not exceed args.missing
    if valueN <= args.missing:
      # concatenate all selected genotypes into one string, then mark
      # missing calls with '?' instead of 'N'
      genotypesMerged = ''.join(str(e) for e in genotypes)
      genotypesMergedP = genotypesMerged.replace('N', '?')
    else:
      # too many Ns: skip the row; siteNumber is NOT advanced for it
      continue

    # count the number of called sites:
    # siteNumber grows by one for every retained non-polymorphic row and is
    # written out (then reset to 1) at each polymorphic row, so the written
    # value is 1 + the number of retained non-polymorphic rows since the
    # previous written row (or since the chromosome file was opened)
    if not calls.is_polymorphic(genotypes):
      siteNumber += 1
    else:
      # output columns (tab-separated): chr, pos, siteNumber, genotype string
      fileoutput.write("%s\t%s\t%s\t%s\n" % (chr, pos, siteNumber, genotypesMergedP))
      siteNumber = 1
    # track progress
Beispiel #2
0
                # window summary from the accumulated per-site values,
                # rounded to 4 decimal places (the `try:` header and the
                # condition that closes a window are outside this excerpt)
                HeterWindow = round(meanWindow(Hwindow, Twindow), 4)
            except Exception:
                # any failure in meanWindow/round is reported as "NA"
                # NOTE(review): presumably meant for empty windows / division
                # by zero, but this also hides every other error -- confirm
                HeterWindow = "NA"
            # emit the finished window: chromosome, start, end, value
            calls.processWindow(Chr, posS, posE, HeterWindow, outputFile)
            # advance the window boundary and start a fresh accumulation
            windPosEnd = windPosEnd + windSize
            Hwindow = []
            Twindow = []
            posS = pos  # the new window starts at the current position
            # keep advancing the boundary until it reaches the current
            # position, i.e. when the gap in positions spans more than one
            # window; no output is produced for the skipped windows here
            while pos > windPosEnd:
                windPosEnd = windPosEnd + windSize

        # remember the chromosome of this row and extend the window end to it
        ChrPrevious = Chr
        posE = pos

        # count hetero
        Nmising = calls.countPerPosition(sample_charaters, 'N')
        # strict '<': a site with exactly allowedN Ns is skipped as well
        if Nmising < allowedN:  # skip if too many Ns
            nHerer = calls.countHeteroPerPosition(sample_charaters)
            # number of non-missing samples at this site
            nTotal = float(nSample - Nmising)
            # per-site heterozygote count and non-missing total are stored in
            # parallel lists that meanWindow() receives for the window
            Hwindow.append(float(nHerer))
            Twindow.append(float(nTotal))

        # track progress: report every 1,000,000 processed lines
        counter += 1
        if counter % 1000000 == 0:
            print str(counter), "lines processed"

# process the last window
try:
    HeterWindow = round(meanWindow(Hwindow, Twindow), 4)
except Exception:
Beispiel #3
0
        # Per-line processing (the enclosing loop header is outside this
        # excerpt): parse one whitespace-separated row, drop it if it has too
        # many missing calls, and write polymorphic sites to a per-chromosome
        # output file.
        words = line.split()
        # chromosome label: the second '_'-separated piece of the first column
        # NOTE(review): `chr` shadows the builtin chr() in this scope
        chr = str(words[0].split('_')[1])
        pos = words[1]  # kept as a string; in this excerpt it is only written out

        # split chromosomes into separate files:
        # when the chromosome label changes, close the current output and
        # open a new one named "<chr>_<args.output>"; the site counter
        # restarts at 1
        if chr != chrPrev:
            fileoutput.close()
            fileoutput = open(chr + '_' + args.output, 'w')
            chrPrev = chr
            siteNumber = 1

        # select samples
        genotypes = calls.selectSamples(sampCol, words)

        # count Ns
        valueN = calls.countPerPosition(genotypes, 'N')

        # keep the row only if the N count does not exceed args.missing
        if valueN <= args.missing:
            # concatenate all selected genotypes into one string, then mark
            # missing calls with '?' instead of 'N'
            genotypesMerged = ''.join(str(e) for e in genotypes)
            genotypesMergedP = genotypesMerged.replace('N', '?')
        else:
            # too many Ns: skip the row; siteNumber is NOT advanced for it
            continue

        # count the number of called sites:
        # siteNumber grows by one for every retained non-polymorphic row and
        # is written out (then reset to 1) at each polymorphic row
        if not calls.is_polymorphic(genotypes):
            siteNumber += 1
        else:
            # output columns (tab-separated): chr, pos, siteNumber, genotypes
            fileoutput.write("%s\t%s\t%s\t%s\n" %
                             (chr, pos, siteNumber, genotypesMergedP))
            siteNumber = 1
  # Main pass over the data rows; each retained row is matched against the
  # ancestor/reference stream `ref` (the rest of the loop body continues past
  # this excerpt).
  for line in datafile:
    # track progress: report every 1,000,000 processed lines
    counter += 1
    if counter % 1000000 == 0:
      print str(counter), "lines processed"

    words = line.split()
    chr_pos = words[0:2]  # first two columns of the row
    # numeric chromosome index: the second '_'-separated piece of column 0
    ch = int(words[0].split('_')[1])
    pos = int(words[1])

    # select samples
    alleles = calls.selectSamples(sampCol, words)

    # count Ns
    valueN = calls.countPerPosition(alleles, 'N')

    # keep the row only if the N count does not exceed args.missing
    if valueN <= args.missing:
      Allalleles = [i for i in alleles if i != 'N']  # non-missing calls only
    else:
      continue  # too many Ns: skip this row

    # find overlap with the ancestor:
    # read reference rows while the current site lies beyond the last
    # reference coordinate that was read (later chromosome, or same
    # chromosome and larger position)
    while ch > ref_ch or (ch == ref_ch and pos > ref_pos):
      words2 = ref.readline().split()
      # an empty list means readline() returned '' (end of file) or a
      # whitespace-only line; the ancestral state is then set to 'N'
      if words2 == []:
        ancest = 'N'
        break
      else:
        ref_chr_pos = words2[0:2]
        # reference chromosome index, parsed the same way as `ch` above
        ref_ch = int(ref_chr_pos[0].split('_')[1])
    # names of the selected sample columns, taken from the header row
    sampColnames = calls.selectSamples(sampCol, header_words)
    # one N counter per selected sample, all starting at zero
    sampNs = [0 for i in sampColnames]

    print('Counting Ns ...')

    # per-site N counts, one entry appended per data row in file order
    Ns = []

    for line in datafile:
        words = line.split()
        chr_pos = words[0:2]  # first two columns of the row

        # select samples
        sample_charaters = calls.selectSamples(sampCol, words)

        # count Ns per position
        contNsOnly = calls.countPerPosition(sample_charaters, 'N')
        Ns.append(contNsOnly)

        # count Ns per sample
        # the return value is discarded -- presumably the helper updates
        # sampNs in place; verify against calls.countPerSample
        calls.countPerSample(sample_charaters, sampNs, 'N')

        # track progress: report every 1,000,000 processed lines
        counter += 1
        if counter % 1000000 == 0:
            print str(counter), "lines processed"

datafile.close()

# write the counts to a file:
# two outputs are opened, named "<args.output>_Ns_per_site.csv" and
# "<args.output>_Ns_per_sample.csv"
outputTXTsite = open(args.output + "_Ns_per_site.csv", 'w')
outputTXTsample = open(args.output + "_Ns_per_sample.csv", 'w')
    # Main pass over the data rows: filter by missing data, count alleles,
    # then advance the ancestor/reference stream `ref` to the current site
    # (the rest of the loop body continues past this excerpt).
    for line in datafile:
        words = line.split()
        # numeric chromosome index: the second '_'-separated piece of column 0
        ch = int(words[0].split('_')[1])
        pos = int(words[1])

        # track progress: report every 1,000,000 processed lines
        counter += 1
        if counter % 1000000 == 0:
            print str(counter), "lines processed"

        # select samples
        genotypesN = calls.selectSamples(sampCol, words)

        # skip sites with missing data
        numN = calls.countPerPosition(genotypesN, 'N')
        # keep the row only if the N count does not exceed AlowedN
        if numN <= AlowedN:
            genotypes = [i for i in genotypesN if i != 'N']  # non-missing only
            nGenotypes = len(genotypes)
        else:
            continue  # skip lines with too many Ns

        # count alleles
        numAl = collections.Counter(genotypes)
        # list of (value, count) pairs ordered from most to least common
        numAlM = numAl.most_common()

        # read reference rows while the current site lies beyond the last
        # reference coordinate that was read (later chromosome, or same
        # chromosome and larger position)
        while ch > ref_ch or (ch == ref_ch and pos > ref_pos):
            words2 = ref.readline().split()
            # an empty list means readline() returned '' (end of file) or a
            # whitespace-only line; the ancestral state is then set to 'N'
            if words2 == []:
                ancest = 'N'
                break