def check_path():
    """Validate the input path supplied through the command-line parser.

    Reads ``filePath`` from ``my_parser()`` and calls ``bomb`` with an
    explanatory message when either check fails:

    * the path is not an existing directory, or
    * no entry of the module-level ``file_list`` (after ``flatten``) ends
      with one of the suffixes ``vcf``, ``vcf.gz``, ``ped`` or ``map``.

    Returns:
        Whatever ``bomb`` returns when a check fails, otherwise ``None``.
    """
    # Resolve the parsed path once instead of re-invoking my_parser() for
    # every use of the same attribute.
    path = my_parser().filePath

    if not isdir(path):
        return bomb(f'No files found at path = {path}\n')

    # `path` is known to be a directory at this point, so a second isdir()
    # call is unnecessary.
    # NOTE(review): file_list is presumably populated from this path before
    # check_path() runs -- confirm against the caller.
    suffixes = ('vcf', 'vcf.gz', 'ped', 'map')
    if not any(item.endswith(suffixes) for item in flatten(file_list)):
        return bomb(
            f'No vcf or plink files found at path = {abspath(path)}\n')

    return None
genotypes.append(row) elif toto == 2: for n, a in enumerate(geno): try: geno_tot.append(plink_ACGT[plink_info[n][0]][int(a)]) except KeyError: pass geno_out.write( '%s %s \n' % (' '.join([sample_id, sample_id, '0', '0', '2', '-9' ]), ' '.join(geno_tot))) # continue writing VCF if toto == 1: for row in zip(vcfsnps, [*zip(*genotypes)]): line = [list(flatten(i)) for i in list(row)] geno_out.write('\t'.join(flatten(line)) + '\n') # Map file for Ped if toto == 2: map_out = open(fo + ".map", "w") map_ped = [] for marker in snps: # snps is a list need to ammend this try: ms = vcf_a1a2[marker][0:3] ms.insert(0, '0') ms = [ms[i] for i in [1, 3, 0, 2]] map_ped.append(ms) except KeyError: ms = snps[marker] ms.insert(0, '0')
within_file_sample_dups = {} # key=file, v = duplicated sample between_file_sample_dups = {} # key=file, v = file_samples between_file_chroms = {} # key=file, v = set(file_chroms) for index, file_ in enumerate(file_list): if str(file_list[index]).endswith("vcf.gz"): command0 = "zcat< " + file_list[index] + "|grep -m 1 '#CHR' |cut -f10-" command1 = "zcat< " + file_list[index] + "| grep -v '#'|cut -f1-2" chroms = [i[0] for i in std_capture(command1)] snps = [':'.join(i) for i in std_capture(command1)] within_file_snps_dups[file_list[index]] = snps between_file_chroms[file_list[index]] = set(chroms) samples = list(flatten(std_capture(command0))) if len(samples) > len(set(samples)): within_file_sample_dups[file_list[index]] = set([ x for x in samples if samples.count(x) > 1 ]) # multimode(samples) between_file_sample_dups[file_list[index]] = set(samples) if str(file_list[index]).endswith("vcf"): command0 = "cat " + file_list[index] + "|grep -m 1 '#CHR' |cut -f10-" command1 = "cat " + file_list[index] + "| grep -v '#'|cut -f1-2" chroms = [i[0] for i in std_capture(command1)] snps = [':'.join(i) for i in std_capture(command1)] within_file_snps_dups[file_list[index]] = snps between_file_chroms[file_list[index]] = set(chroms)
except ValueError: chrom, snpname, cm, pos = line.strip().split() key = chrom + ":" + pos if key not in markers: markers[key] = [snpname] + mark_chip if key in markers: value = list(markers[key]) value[chip + 1] = snp_number + 1 markers[key] = value # Create a comprehensive snps file # Sort the snps by chrom then pos and write index mark = [] for row in [list(flatten(i)) for i in list(markers.items())]: row.insert(0, row[1]) row.insert(1, row[1].split(":")[0]) row.insert(2, row[2].split(":")[1]) row.pop(3) row.pop(3) mark.append(row) ''' for row in mark: print(row) ''' mark = sorted(mark, key=lambda x: (int(x[1]), int(x[2]))) f_out = "snpinfo.txt" mark_out = open(f_out, "w") mark_out.write('marker bta pos chips...\n')
geno = [] chip = index + 1 snp_number = 0 with open_by_suffix(file_list[index]) as f: for line in f: if "##" in line: continue # CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,samples... bta, pos, snp, ref, alt, qual, filter_, info, format_, genotype = line.strip( ).split("\t", 9) bta = bta.replace('Chr', '') geno.append( list( flatten([ vcf_2_fimpute.get(i, i) for i in [ ii.split(':')[0].replace("|", "/") for ii in genotype.split('\t') ] ]))) if bta != '#CHROM ' and pos != 'POS': if bta + ":" + pos not in snps_list: snps_list[bta + ":" + pos] = [bta, snp, "0", pos, ref, alt] elif bta + ":" + pos in snps_list and snps_list[ bta + ":" + pos][4].lower() != ref.lower(): print( 'Some REF/ALT may be flipped \n' 'Normalize VCF files e.g. `bcftools norm -f [REF_GENOME] ... `\n' ) raise SystemExit