def loadgensnpy(filename, source=None):
    '''Snpy wrapper, with an optional source given as a named parameter.

    source = one of ``"23andme"``, ``"vcf"``, ``"decodeme"`` or ``"ftdna"``.
    '''
    snpy_generator = sn.parse(filename, source)
    return snpy_generator
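# A minimal usage sketch, assuming "genome.txt" is a hypothetical 23andMe raw
# export and that this module imports the snpy package as ``sn``; the wrapper
# simply forwards to sn.parse(), so it returns a lazy generator of SNP records.
snps = loadgensnpy("genome.txt", source="23andme")
first = next(iter(snps))     # each record exposes .name, .chromosome,
genotype = first.genotype    # .position and .genotype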
def write_ped(self, dirfilespheno, outputfile, pheno):
    """Read the files inside a folder and parse them to write a .ped file."""
    for i in os.listdir(dirfilespheno):
        all_rs_ind_tmp = np.array(["0 0"] * len(self.all_rs), dtype='S3')  # initialize every genotype with "0 0"
        sex = ""
        all_names = []
        all_gen = []
        print dirfilespheno + "/" + i,
        if "XY" in i:
            sex = "1"
        elif "XX" in i:
            sex = "2"
        else:
            sex = "9"  # sex = "other"
        try:
            snps = sn.parse(dirfilespheno + "/" + i)  # parse the next file
        except:
            print " ERROR 1"
            continue
        try:
            for cur_snp in snps:
                # keep only valid two-letter genotypes (the old check excluded "--", "II" and "DI")
                if len(cur_snp.genotype) == 2 and cur_snp.genotype in self.valid_alleles:
                    all_names.append(cur_snp.name)
                    all_gen.append("%s %s" % (cur_snp.genotype[0], cur_snp.genotype[1]))
        except:
            print " ERROR 2"
            continue
        try:
            idx = np.flatnonzero(np.in1d(self.all_rs, np.array(all_names)))
        except:
            print " ERROR 3"
            continue
        print ""
        all_rs_ind_tmp[idx] = np.array(all_gen)
        outputfile.write(str(self.Family_ID) + " " + str(self.Individual_ID) + " " +
                         "0 0 " + sex + " " + pheno + " ")
        for gt in all_rs_ind_tmp:  # renamed from "i" to avoid shadowing the file name
            outputfile.write(gt + " ")
        outputfile.write("\n")
        self.Family_ID = self.Family_ID + 1
        self.Individual_ID = self.Individual_ID + 1
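# For reference, each row written above follows the standard PLINK .ped layout
# (values below are hypothetical):
#
#   FamilyID IndividualID Father Mother Sex Phenotype  A G  0 0  C C  ...
#
# i.e. six leading columns, then one "allele allele" pair per SNP in
# self.all_rs, with "0 0" marking missing genotypes.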
def convert_genotype_to_hdf5(csv_content, output_file, source=None):
    log.info('Convert genotype from text format to HDF5 format %s' % (output_file))
    fd = io.StringIO(csv_content)
    # guess the source if it was not given
    if source is None:
        source = sn.guess_source_from_content(csv_content)
    version = ''
    snps = sn.parse(fd, source)
    start_chr = None
    f = h5py.File(output_file, 'w')
    pos_snps = []
    num_snps = 0
    group = None  # Eliminates syntax error highlight in Eclipse.
    for snp in snps:
        num_snps += 1
        if snp.chromosome != start_chr:
            if start_chr is not None:
                # flush the previous chromosome, sorted by position
                pos_snps.sort(key=lambda x: x[0])
                positions, ids, genotypes = zip(*pos_snps)
                group.create_dataset('ids', (len(positions),), chunks=True,
                                     compression='lzf', dtype='S15', data=ids)
                group.create_dataset('positions', (len(positions),), chunks=True,
                                     compression='lzf', dtype='i8', data=positions)
                group.create_dataset('snps', (len(positions),), chunks=True,
                                     compression='lzf', dtype='S2', data=genotypes)
                pos_snps = []
            start_chr = snp.chromosome
            group = f.create_group("Chr%s" % start_chr)
        pos_snps.append((int(snp.position), snp.name.encode('utf-8'),
                         snp.genotype.encode('utf-8')))
    # flush the last chromosome
    pos_snps.sort(key=lambda x: x[0])
    positions, ids, genotypes = zip(*pos_snps)
    group.create_dataset('ids', (len(positions),), chunks=True,
                         compression='lzf', dtype='S15', data=ids)
    group.create_dataset('positions', (len(positions),), chunks=True,
                         compression='lzf', dtype='i8', data=positions)
    group.create_dataset('snps', (len(positions),), chunks=True,
                         compression='lzf', dtype='S2', data=genotypes)
    # find out the version
    f.attrs['source'] = source
    # based on https://en.wikipedia.org/wiki/23andMe
    if source == '23andme':
        if num_snps <= 576000:
            version = 'v1'
        elif num_snps <= 597000:
            version = 'v2'
        # TODO: current workaround because OAuth genotypes contain > 1000000 SNPs
        elif num_snps <= 611000 or num_snps > 1000000:
            version = 'v4'
        elif num_snps <= 992000:
            version = 'v3'
    f.attrs['version'] = version
    f.attrs['gender'] = 'm' if 'ChrY' in f.keys() else 'f'
    f.close()
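# A read-back sketch for the layout produced above (the file name is
# hypothetical): each "Chr<N>" group holds three parallel, position-sorted
# datasets, and 'source', 'version' and 'gender' are file-level attributes.
import h5py

with h5py.File('genotype.hdf5', 'r') as hf:
    chr1 = hf['Chr1']
    positions = chr1['positions'][:]   # int64 positions
    ids = chr1['ids'][:]               # SNP names as byte strings (S15)
    genotypes = chr1['snps'][:]        # two-letter genotypes as byte strings (S2)
    source = hf.attrs['source']        # e.g. '23andme'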
def preprocess_genotype_tsv(input, source, output):
    """
    Preprocess input files from another source.

    :param input: path to the raw genotype file
    :param source: source format passed to sn.parse (e.g. "23andme", "vcf")
    :param output: path of the tab-separated file to write
    :return: the output path
    """
    snps = sn.parse(input, source)
    with open(output, 'w') as fh:
        for s in snps:
            rs = s.name
            if rs[0:2] != 'rs':
                continue
            ref, alt = split_genotype(s.genotype)
            if alt is None:
                continue
            fh.write('\t'.join([rs, ref, alt]) + '\n')
    return output
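# ``split_genotype`` is not defined in this snippet; a minimal sketch of what
# it might look like, inferred only from the call site above (it must return
# two alleles, with the second one None for no-calls or indel codes):
def split_genotype(genotype):
    if len(genotype) == 2 and genotype[0] in 'ACGT' and genotype[1] in 'ACGT':
        return genotype[0], genotype[1]
    return genotype, None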
def update_index(seq_path, seq_id, dbsnp=None, index_root=None):
    """Update the 'consequence' index with SNPs from a given file.

    Currently a file should either be in VCF, with a filename like
    ``*.vcf``, or in one of the openSNP-provided formats; in that case
    the name is expected to be left unmodified:
    ``user_id.format.submission_id``.

    If the `dbsnp` argument is provided, only SNPs contained in `dbsnp`
    will be indexed.
    """
    g = Genome(base_path=index_root)
    dbsnp = {} if not dbsnp else \
        dict((r.POS, r.ID) for r in vcf.VCFReader(open(dbsnp, "rb")))

    for record in sn.parse(seq_path,
                           source="vcf" if seq_path.endswith(".vcf") else None):
        # XXX here comes the punchline: VCF uses 1-based indexing, and so do
        # SAM / BAM, but 'pysam' converts *all* indexes to 0-based!
        chrom, pos = key = record.chromosome, record.position - 1
        if dbsnp and pos not in dbsnp:
            continue
        elif key not in g:
            # Note(Sergei): dbSNP name defaults to '*'.
            g[key] = Chunk(dbsnp.get(pos, record.name or "*"))

        for alt in record.genotype:
            known = getattr(g[key], alt, None) or set()
            if seq_id in known:
                continue

            known.add(seq_id)
            g[key] = g[key]._replace(**{alt: known})

    g.dump()
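# A hypothetical call, assuming an openSNP-style filename and an index rooted
# at "./index"; ``Genome`` and ``Chunk`` come from the surrounding project and
# are not shown in this snippet.
update_index("6523.23andme.4623", seq_id="6523",
             dbsnp="dbsnp.vcf", index_root="./index")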
def creat_map(output_file="output.map"):
    """Create the .map file"""
    input_file = raw_input("\n-------folders with files: ")
    dir_list = input_file.split(";")
    files = []
    file_dir = {}
    for i in dir_list:
        files.extend(os.listdir(i))  # all file names in the dir
        for j in os.listdir(i):
            file_dir[j] = i  # dictionary mapping file -> dir
    rs_list = []
    map_list = []
    idx = 0
    print "\nParse the file ", file_dir[files[0]] + "/" + files[0],
    try:
        snps = sn.parse(file_dir[files[0]] + "/" + files[0])  # parse the first file
        for i in snps:  # initialize rs_list and map_list with the first file
            rs_list.append(i[0])
            map_list.append((i[0], i[2], i[3]))
    except:
        print " ERROR 1"
    print " OK"
    dtype = [('rs', 'S10'), ('chrom', 'S10'), ('position', int)]
    map_list = np.array(map_list, dtype=dtype)
    for j in files[1:]:
        print "Parse the file ", file_dir[j] + "/" + j,
        try:
            snps = sn.parse(file_dir[j] + "/" + j)  # parse the next file
            rs_list_tmp = []
            map_list_tmp = []
        except:
            print " ERROR 2"
            continue
        try:
            for i in snps:
                rs_list_tmp.append(i[0])
                map_list_tmp.append((i[0], i[2], i[3]))
        except:
            print " ERROR 3"
            continue
        try:
            # some files raise errors here, for example:
            # user36_file207_yearofbirth_1986_sex_XY.23andme-exome-vcf.txt
            # user36_file327_yearofbirth_1986_sex_XY.23andme-exome-vcf.txt
            rs_list_dif = np.setdiff1d(rs_list_tmp, rs_list)  # rs IDs not seen yet
        except:
            print " ERROR 4"
            continue
        idx = np.flatnonzero(np.in1d(np.array(rs_list_tmp), np.array(rs_list_dif)))  # indexes of the new rs IDs within rs_list_tmp
        rs_list.extend([rs_list_tmp[k] for k in idx])  # remember them so later files are deduplicated against them as well
        map_list_tmp = np.array(map_list_tmp, dtype=dtype)
        map_list = np.array(np.concatenate((map_list, map_list_tmp[idx])), dtype=dtype)
        print " OK"
    print "sort..."
    map_list = np.sort(map_list, order=['chrom', 'position'])
    chrommosomos = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
                    "14", "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y", "MT"]
    ofile = open(output_file, 'w')  # open the output file for writing
    print "write the output file..."
    for i in chrommosomos:
        if i in map_list['chrom']:
            idx = map_list['chrom'] == i
            for row in map_list[idx]:  # renamed from "i" to avoid shadowing the chromosome
                ofile.write(str(row[1]) + " " + str(row[0]) + " " + str(0) + " " + str(row[2]) + "\n")
    ofile.close()
sex="" print i if "XY" in i: sex="1" print "m" elif "XX" in i: sex="2" print "f" else: sex="other" print "o" snps = sn.parse(dir_files_phenotype1+"/"+i) #take another files all_names=[] all_gen=[] for cur_snp in snps: if len(cur_snp.genotype)==2 and cur_snp.genotype != "--": all_names.append(cur_snp.name) all_gen.append("%s %s" % (cur_snp.genotype[0], cur_snp.genotype[1])) idx = np.flatnonzero(np.in1d(all_rs, np.array(all_names))) #return the index of the elements of "all_names" in "all_rs" all_rs_ind_tmp[idx] = np.array(all_gen)
def create_map(self, folders_case, folders_control):
    """Create the .map file"""
    print "\n\nMAP FILE (", time.asctime(), ")\n"
    dir_list = folders_case[:]
    dir_list.extend(folders_control)
    files = []
    file_dir = {}
    map_list = []
    idx = 0
    for i in dir_list:
        files.extend(os.listdir(i))  # collect all file names in the dir
        for j in os.listdir(i):
            file_dir[j] = i  # dictionary mapping file -> dir
    print "Reading the files:\n\n", file_dir[files[0]] + "/" + files[0],
    # parse the first file
    try:
        snps = sn.parse(file_dir[files[0]] + "/" + files[0])
        for i in snps:  # initialize map_list with the first file
            map_list.append((i[0], i[2], i[3]))
    except:
        print " ERROR 1"
    print ""
    dtype = [('rs', 'S10'), ('chrom', 'S10'), ('position', int)]
    map_list = np.array(map_list, dtype=dtype)
    for j in files[1:]:
        map_list_tmp = []
        print file_dir[j] + "/" + j,
        try:
            snps = sn.parse(file_dir[j] + "/" + j)  # parse the next file
        except:
            print " ERROR 2"
            continue
        try:
            for i in snps:
                map_list_tmp.append((i[0], i[2], i[3]))
        except:
            print " ERROR 3"
            continue
        print ""
        map_list_tmp = np.array(map_list_tmp, dtype=dtype)
        map_list = np.array(np.concatenate((map_list, map_list_tmp)), dtype=dtype)
    # drop duplicate rs IDs
    u, indices = np.unique(map_list['rs'], return_index=True)
    map_list = map_list[indices]
    # add any chromosomes not listed yet to the end of self.chrommosomos
    array_chrom = np.unique(map_list['chrom'])
    idx_chr = np.in1d(array_chrom, self.chrommosomos)
    self.chrommosomos = np.concatenate((self.chrommosomos, array_chrom[~idx_chr]))
    map_list = np.sort(map_list, order=['chrom', 'position'])
    ofile = open(self.outputmap, 'w')  # open the output file for writing
    print "there are", len(map_list['rs']), "SNPs.\nwriting the", self.outputmap, "file..."
    for i in self.chrommosomos:
        if i in map_list['chrom']:
            idx = map_list['chrom'] == i
            for row in map_list[idx]:  # renamed from "i" to avoid shadowing the chromosome
                ofile.write(str(row[1]) + " " + str(row[0]) + " " + str(0) + " " + str(row[2]) + "\n")
    ofile.close()
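# For reference, each row written above follows the standard PLINK .map layout
# (values below are hypothetical):
#
#   1 rs4477212 0 82154
#
# i.e. chromosome, rs ID, genetic distance (always 0 here) and base-pair
# position, with one row per unique SNP found across all input folders.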