############################# program #############################

# Line counter; it is only initialised in this fragment.
counter = 0

# Open the ancestral file and consume its first line (the header).
ances = open(args.ancestral, 'r')
ances_words = ances.readline()

output = open(args.output, 'w')

print('Opening the file...')
# NOTE(review): the original indentation was lost; everything below is kept
# inside the `with` so that `datafile` stays open for any code that follows.
with open(args.input) as datafile:
    header_words = datafile.readline().split()

    # Locate the requested samples in the input header.
    sampCol = calls.indexSamples(sampleNames, header_words)

    print('Creating the output file...')
    samples_head = calls.selectSamples(sampCol, header_words)
    samples_headP = '\t'.join(map(str, samples_head))
    # Output header: fixed columns followed by the selected sample names.
    output.write(
        '\t'.join(('CHROM', 'POS', 'ANC', 'DER', samples_headP)) + '\n')

    # Read the second line of the ancestral file.
    ances_words = ances.readline().split()
    # The chromosome number follows either the first '_' or the 'chr' prefix
    # of the first column.
    ances_ch = int(
        ances_words[0].split('_' if '_' in ances_words[0] else 'chr')[1])
    ances_pos = int(ances_words[1])
    ances_gt = splitAncestral(ances_words[2])
sampleNames = calls.checkSampleNames(args.samples, args.input)

############################# program #############################

counter = 0
siteNumber = 1
# Output starts in the file for chromosome "1".
# NOTE(review): if the first data line is on another chromosome, the
# "1_<output>" file is still created - confirm this is intended.
chrPrev = '1'

print('Opening the file...')
with open(args.input) as datafile:
    header_line = datafile.readline()
    header_words = header_line.split()

    # Column indices of the requested samples.
    sampCol = calls.indexSamples(sampleNames, header_words)

    print('Creating the output file...')
    fileoutput = open('_'.join((chrPrev, args.output)), 'w')

    for line in datafile:
        words = line.split()
        # The chromosome label is the part of column 1 after the first '_'.
        # NOTE(review): `chr` shadows the builtin; the name is kept because
        # code outside this fragment may read it.
        chr = words[0].split('_')[1]
        pos = words[1]

        # On a chromosome change, switch to a new per-chromosome output file
        # and restart the site numbering.
        if chr != chrPrev:
            fileoutput.close()
            fileoutput = open('_'.join((chr, args.output)), 'w')
            chrPrev = chr
            siteNumber = 1
counter = 0

print('Opening the file...')
with open(args.input) as datafile:
    header_words = datafile.readline().split()

    # The output header is the tab-separated list of population names.
    print('Creating the output file...')
    fileoutput = open(args.output, 'w')
    popsP = '\t'.join(map(str, pops))
    fileoutput.write(popsP + "\n")

    # For every population, store the column indices of its samples in a
    # dynamically named variable "<pop>Index", looked up from "<pop>samples".
    for popName in pops:
        vars()[popName + "Index"] = calls.indexSamples(
            vars()[popName + "samples"], header_words)

    for line in datafile:
        words = line.split()
        GT = words[2:]
        # Distinct alleles on this line, with the missing-data code 'N' removed.
        GTpair = [allele for allele in set(GT) if allele != 'N']
        popNum = 0
        # if ("N" not in GT) and (len(GTpair) == 2):  # skip missing data or non-biallelic
        if len(GTpair) == 2:  # skip non-biallelic
            for popName in pops:
                # Running population number; the original comment says it is
                # needed to format the output correctly (that code is not
                # visible here).
                popNum += 1
                # Genotypes of the samples belonging to this population.
                sGT = calls.selectSamples(vars()[popName + "Index"], words)
############################# program #############################


def all_same(items):
    """Return True if every element after the first equals the first.

    Empty and single-element sequences yield True.
    """
    for x in items[1:]:
        if not x == items[0]:
            return False
    return True


counter = 0

output = open(args.output, 'w')
output.write("#CHR\tPOS\tCommon_alleles\tRare_alleles\n")

print('Opening the file...')
with open(args.input) as datafile:
    header_words = datafile.readline().split()

    # Column indices of the requested samples.
    sIndex = calls.indexSamples(sNames, header_words)

    # Replace the requested names with the names as selected from the header.
    sNames = calls.selectSamples(sIndex, header_words)

    for line in datafile:
        words = line.split()
        # The first two columns are chromosome and position.
        chr_pos = words[0:2]
        chr_posP = '\t'.join(map(str, chr_pos))

        # Genotypes of the selected samples.
        sGT = calls.selectSamples(sIndex, words)

        # Genotypes written with a one-character code are converted with
        # calls.OneToTwo before further processing.
        if all(len(code) == 1 for code in sGT):
            alleles = calls.OneToTwo(sGT)
parser.add_argument(
    '-s', '--samples',
    help='column names of the samples to process (optional)',
    type=str,
    required=False)
args = parser.parse_args()

# Check whether sample names were given and whether they are all present in
# the header of the tab file.
sampleNames = calls.checkSampleNames(args.samples, args.tab)

############################# program #############################

print('Opening the file...')

counter = 0

# Annotation file: the first line holds the field names, and the second line
# is read as the first annotation record.
siftFile = open(args.annotation, 'r')
annotOptions = siftFile.readline().split()
fieldsNames = args.fields.split(',')
fieldsIndex = calls.indexSamples(fieldsNames, annotOptions)

sift_words = siftFile.readline().split()
# Column 1 is split on '_' and the second piece is used as the chromosome
# number; column 2 is the position.
sift_chr = int(sift_words[0].split('_')[1])
sift_pos = int(sift_words[1])

with open(args.tab) as datafile:
    header_words = datafile.readline().split()

    # Column indices of the requested samples.
    sampCol = calls.indexSamples(sampleNames, header_words)

    # Build the output header: the first two columns plus the selected samples.
    print('Creating the output file...')
    output = open(args.output, 'w')
    # NOTE(review): the name is spelled `ouput_header`; it is kept as is
    # because code outside this fragment may reference it.
    ouput_header = header_words[:2] + calls.selectSamples(sampCol, header_words)
# PHYLIP header line: number of samples and number of positions.
outputPhy.write(' %s %s\n' % (NumberSamp, NumberPos))

# Process one sample at a time to reduce RAM usage: the input file is re-read
# from the start for every sample.
for sample in sampleNames:
    # Write the sample name into both outputs.
    outputFasta.write(">%s\n" % sample)
    outputPhy.write("%s " % sample)
    fastaLim = 0  # counter to split the sequence in multi-line fasta
    with open(args.input) as datafile:
        header_words = datafile.readline().split()

        # Index this single sample.
        sampCol = calls.indexSamples([sample], header_words)

        for line in datafile:
            words = line.split()
            genotype = calls.selectSamples(sampCol, words)
            # Output only single-nucleotide genotypes; insertions are replaced
            # with N.
            # Fix: the original tested only len(genotype), the length of the
            # value returned by selectSamples. Other callers treat that value
            # as a list, so for one sample the test would always pass and
            # multi-character genotypes would be written verbatim. The
            # genotype string itself must also be one character long.
            # NOTE(review): assumes selectSamples returns a one-element list
            # here - confirm against calls.selectSamples.
            if len(genotype) == 1 and len(genotype[0]) == 1:
                outputFasta.write(genotype[0])
                outputPhy.write(genotype[0])
            else:
                outputFasta.write('N')
                outputPhy.write('N')
            # to split sequence in multi-line fasta
# Parse the family specification: after stripping surrounding double quotes,
# entries are separated by ';' and each entry looks like "name[s1,s2,...]".
for i in familyNames.strip("\"").split(";"):
    famName = i.split("[")[0]
    # Text between the brackets. Fix: the original pattern "\[|\]|" ended with
    # an empty alternative, so on Python >= 3.7 re.split also splits on empty
    # matches and element [1] was a single character instead of the sample
    # list. A raw string also avoids the invalid "\[" escape sequence.
    famSample = re.split(r"\[|\]", i)[1]
    Fsamples.append(famSample.split(","))
    familySamples[famName] = calls.checkSampleNames(famSample, args.input)

samples = calls.flattenList(Fsamples)
# The return values are not used; these calls are presumably made so that
# checkSampleNames can report missing 'ANC' / 'DER' columns - confirm.
calls.checkSampleNames('ANC', args.input)
calls.checkSampleNames('DER', args.input)

############################# program #############################

outputSNPs = open(args.output + '.snps', 'w')

with open(args.input) as datafile:
    header_words = datafile.readline().split()

    # Column indices of all family samples and of the ANC / DER columns.
    sampleIndex = calls.indexSamples(samples, header_words)
    ANCindex = calls.indexSamples(['ANC'], header_words)
    DERindex = calls.indexSamples(['DER'], header_words)

    # Per-family column indices and an (initially empty) list of phased lines.
    FamilyIndex = {}
    phasedLines = {}
    for family in familySamples:
        FamilyIndex[family] = calls.indexSamples(familySamples[family], header_words)
        phasedLines[family] = []

    for line in datafile:
        words = line.split()
        CHR = words[0]
        POS = words[1]
        # Site key of the form "<chromosome>_<position>".
        chr_pos = CHR + '_' + POS