Beispiel #1
0
def loadgensnpy(filename, source = None):
    '''
    Thin wrapper around ``sn.parse``.

    filename = forwarded unchanged as the first argument of ``sn.parse``.
    source = optional format name, one of ``"23andme"``, ``"vcf"``,
        ``"decodeme"`` or ``"ftdna"``; ``None`` when omitted.

    Returns the object produced by ``sn.parse(filename, source)``.
    '''
    return sn.parse(filename, source)
Beispiel #2
0
def loadgensnpy(filename, source = None):
    '''
    Parse a genotype file by delegating to ``sn.parse``.

    filename = handed to ``sn.parse`` as-is.
    source = optional named parameter selecting the format: one of
        ``"23andme"``, ``"vcf"``, ``"decodeme"`` or ``"ftdna"``
        (``None`` by default).

    The value returned by ``sn.parse`` is passed back to the caller.
    '''
    parsed = sn.parse(filename, source)
    return parsed
Beispiel #3
0
    def write_ped(self, dirfilespheno, outputfile, pheno):
        """
        read the file inside a folder and parse to write a .ped.
        """

        for i in os.listdir(dirfilespheno):
            all_rs_ind_tmp = np.array(
                ["0 0"] * len(self.all_rs),
                dtype='S3')  #initialaze every genotype with "0 0"
            sex = ""
            all_names = []
            all_gen = []
            print dirfilespheno + "/" + i,
            if "XY" in i:
                sex = "1"
            elif "XX" in i:
                sex = "2"
            else:
                sex = "9"  #sex="other"
            try:
                snps = sn.parse(dirfilespheno + "/" + i)  #take another files
            except:
                print "   ERRO 1"
                continue
            try:
                for cur_snp in snps:
                    if len(
                            cur_snp.genotype
                    ) == 2 and cur_snp.genotype in self.valid_alleles:  # "--" and cur_snp.genotype != "II" and cur_snp.genotype != "DI":
                        all_names.append(cur_snp.name)
                        all_gen.append(
                            "%s %s" %
                            (cur_snp.genotype[0], cur_snp.genotype[1]))
            except:
                print "   ERRO 2"
                continue
            try:
                idx = np.flatnonzero(np.in1d(self.all_rs, np.array(all_names)))
            except:
                print "   ERRO 3"
                continue
            print ""
            all_rs_ind_tmp[idx] = np.array(all_gen)
            outputfile.write(
                str(self.Family_ID) + " " + str(self.Individual_ID) + " " +
                "0 0 " + sex + "  " + pheno + "  ")
            for i in all_rs_ind_tmp:
                outputfile.write(i + "  ")
            outputfile.write("\n")
            self.Family_ID = self.Family_ID + 1
            self.Individual_ID = self.Individual_ID + 1
Beispiel #4
0
def _write_chromosome_datasets(group, pos_snps):
    """Write one chromosome's SNPs into ``group``, sorted by position.

    ``pos_snps`` is a non-empty list of ``(position, id, genotype)`` tuples.
    It is sorted in place so the three datasets share the same ordering.
    """
    # list.sort() sorts in place; sorted() would return a new list and leave
    # pos_snps untouched.
    pos_snps.sort(key=lambda x: x[0])
    positions, ids, genotypes = zip(*pos_snps)
    group.create_dataset('ids', (len(positions),), chunks=True, compression='lzf', dtype='S15', data=ids)
    group.create_dataset('positions', (len(positions),), chunks=True, compression='lzf', dtype='i8', data=positions)
    group.create_dataset('snps', (len(positions),), chunks=True, compression='lzf', dtype='S2', data=genotypes)


def convert_genotype_to_hdf5(csv_content, output_file, source=None):
    """Convert a text genotype into an HDF5 file.

    :param csv_content: the genotype file content as text.
    :param output_file: path of the HDF5 file to create (opened with 'w').
    :param source: genotype format name passed to ``sn.parse``; when ``None``
        it is guessed with ``sn.guess_source_from_content``.
    :raises ValueError: if the content yields no SNPs.

    For every run of consecutive SNPs with the same chromosome a group
    ``Chr<chromosome>`` is created holding the datasets ``ids``,
    ``positions`` and ``snps``, ordered by position.  The file attributes
    ``source``, ``version`` and ``gender`` are set; ``version`` is only
    derived for the ``23andme`` source (from the SNP count) and is an empty
    string otherwise; ``gender`` is 'm' when a ``ChrY`` group exists, else 'f'.
    """
    log.info('Convert genotype from text format to HDF5 format %s' % (output_file))
    fd = io.StringIO(csv_content)
    # guess the source
    if source is None:
        source = sn.guess_source_from_content(csv_content)
    version = ''
    snps = sn.parse(fd, source)

    # The context manager closes the file even when parsing or writing fails.
    with h5py.File(output_file, 'w') as f:
        start_chr = None
        group = None
        pos_snps = []
        num_snps = 0
        for snp in snps:
            num_snps += 1
            if snp.chromosome != start_chr:
                # Chromosome changed: flush what was collected so far.
                if start_chr is not None:
                    _write_chromosome_datasets(group, pos_snps)
                    pos_snps = []
                start_chr = snp.chromosome
                group = f.create_group("Chr%s" % start_chr)
            pos_snps.append((int(snp.position), snp.name.encode('utf-8'), snp.genotype.encode('utf-8')))

        if group is None:
            raise ValueError('No SNPs found in genotype content')
        # Flush the last chromosome.
        _write_chromosome_datasets(group, pos_snps)

        # find out the version
        f.attrs['source'] = source
        # based on https://en.wikipedia.org/wiki/23andMe
        if source == '23andme':
            if num_snps <= 576000:
                version = 'v1'
            elif num_snps <= 597000:
                version = 'v2'
            # TODO current workaround because oauth genotype are > 1000000
            elif num_snps <= 611000 or num_snps > 1000000:
                version = 'v4'
            elif num_snps <= 992000:
                version = 'v3'
        f.attrs['version'] = version
        f.attrs['gender'] = 'm' if 'ChrY' in f.keys() else 'f'
Beispiel #5
0
def preprocess_genotype_tsv(input, source, output):
    """
    Reduce a genotype file to a three-column, tab-separated file.

    Records are read with ``sn.parse(input, source)``.  A record is kept
    only when its name begins with ``rs`` and ``split_genotype`` returns a
    second value that is not ``None``; each kept record becomes one line
    ``name<TAB>ref<TAB>alt``.

    :param input: forwarded to ``sn.parse`` as the data to read
    :param source: forwarded to ``sn.parse`` as the source format
    :param output: path of the file to write (opened with mode ``'w'``)
    :return: ``output``, unchanged
    """
    records = sn.parse(input, source)
    with open(output, 'w') as out:
        for record in records:
            name = record.name
            if name[:2] == 'rs':
                ref, alt = split_genotype(record.genotype)
                if alt is not None:
                    line = '\t'.join((name, ref, alt))
                    out.write(line + '\n')
    return output
Beispiel #6
0
def update_index(seq_path, seq_id, dbsnp=None, index_root=None):
    """Update 'consequence' index with SNPs from a given file.

    Currently a file should either be in VCF, with a filename like
    ``*.vcf``, or in one of openSNP-provided formats, in that case,
    the name is expected to be left unmodified:
    ``user_id.format.submission_id``.

    If `dbsnp` argument is provided, only SNPs contained in `dbsnp`
    will be indexed.

    :param seq_path: path of the file parsed with ``sn.parse``.
    :param seq_id: identifier added to the set stored under each observed
        allele.
    :param dbsnp: optional path of a VCF file read with ``vcf.VCFReader``.
    :param index_root: passed to ``Genome`` as ``base_path``.
    """
    g = Genome(base_path=index_root)

    if dbsnp:
        # Read the whole dbSNP file up front and close it straight away
        # instead of leaving the handle open.
        with open(dbsnp, "rb") as dbsnp_file:
            dbsnp = dict((r.POS, r.ID) for r in vcf.VCFReader(dbsnp_file))
    else:
        dbsnp = {}

    for record in sn.parse(seq_path,
        source="vcf" if seq_path.endswith(".vcf") else None):

        # XXX here comes the punchline, VCF uses 1-based indexing,
        # so does SAM / BAM, but 'pysam' converts *all* indexes to
        # 0-based!
        chrom, pos = key = record.chromosome, record.position - 1

        # NOTE(review): `dbsnp` is keyed by POS exactly as VCFReader
        # reports it and by position only (no chromosome), while `pos`
        # here has been shifted by -1 -- confirm both use the same base.
        if dbsnp and pos not in dbsnp:
            continue
        elif key not in g:
            # Note(Sergei): dbnSNP name defaults to '*'.
            g[key] = Chunk(dbsnp.get(pos, record.name or "*"))

        for alt in record.genotype:
            # Each allele attribute holds the set of sequence ids in which
            # it was seen; a missing or empty value starts a fresh set.
            known = getattr(g[key], alt, None) or set()
            if seq_id in known:
                continue

            known.add(seq_id)
            g[key] = g[key]._replace(**{alt: known})

    g.dump()
Beispiel #7
0
def creat_map(output_file="output.map"):
    """Create the .map file"""
    input_file=raw_input("\n-------folders with files: ")
    dir_list=input_file.split(";")
    files=[]
    file_dir={}    
    for i in dir_list:
             files.extend(os.listdir(i)) #all files names in a dir
             for j in os.listdir(i):   
                file_dir[j]=i 
                #print j,i  


    rs_list=[]    
    map_list=[]
    idx=0

    print "\nParse the file ", file_dir[files[0]]+"/"+files[0],  

    try:
        snps = sn.parse(file_dir[files[0]]+"/"+files[0])#take the first file	        	
        for i in snps:              #initialaze rs_list and map_list with the first file
            rs_list.append(i[0])
            map_list.append((i[0],i[2],i[3]))
    except:
             print "   ERRO 1"             

    print "  OK"

    dtype = [('rs', 'S10'), ('chrom', 'S10'), ('position', int)]       
    map_list = np.array( map_list, dtype=dtype)

 
    for j in files[1:]:

        print "Parse the file  ",file_dir[j]+"/"+j, 
        
        try:	
            snps = sn.parse(file_dir[j]+"/"+j)       #take another files
            rs_list_tmp=[]
            map_list_tmp=[]
        except:
             print  "   ERRO 2"
             continue   

        try:	        	
          for i in snps:
            rs_list_tmp.append(i[0])
            map_list_tmp.append((i[0],i[2],i[3]))
        except:
             print "   ERRO 3"
             continue   

        try:	                                                        #erro 3 in this files for exemplo
            rs_list_dif=np.setdiff1d( rs_list_tmp,  rs_list)            #user36_file207_yearofbirth_1986_sex_XY.23andme-exome-vcf.txt
        except:                                                         #user36_file327_yearofbirth_1986_sex_XY.23andme-exome-vcf.txt
             print "   ERRO 4"
             continue   


        idx = np.flatnonzero(np.in1d(np.array(rs_list_tmp),np.array(rs_list_dif))) #return the index of the elements of "all_names" in "all_rs"
        map_list_tmp=np.array(map_list_tmp, dtype=dtype)             
        map_list=np.array(np.concatenate((  map_list, map_list_tmp[idx]  )) , dtype=dtype)

        print "  OK"
 
    print "sort..."
    map_list = np.sort(map_list, order=['chrom', 'position']) 

    chrommosomos=["1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18","19","20","21","22","X","Y","MT"]        


    ofile = open(output_file,'w')    # open file for writing
    
    print "write the output file..."
    for i in chrommosomos:
        if i in map_list['chrom']:

            idx = map_list['chrom']==i                                  

            for i in map_list[:][idx]:
                ofile.write(str(i[1])+" "+str(i[0])+" "+str(0)+" "+str(i[2])+"\n")
        
        
    ofile.close()
Beispiel #8
0

    # Fragment: handles one genotype file named `i`.  `i`,
    # `dir_files_phenotype1`, `all_rs` and `all_rs_ind_tmp` are defined
    # outside the visible lines.
    sex=""

    # Derive the sex code from the file name: "XY" -> "1", "XX" -> "2",
    # anything else -> "other".
    print i
    if "XY" in i:
        sex="1"
        print "m"
    elif "XX" in i:
        sex="2"
        print "f"
    else:
        sex="other"
        print "o"

    snps = sn.parse(dir_files_phenotype1+"/"+i)       # parse this individual's file

    all_names=[]
    all_gen=[]

    # Keep only two-character genotypes other than "--", stored as "A B".
    for cur_snp in snps:
        if len(cur_snp.genotype)==2 and cur_snp.genotype != "--":
            all_names.append(cur_snp.name)
            all_gen.append("%s %s" % (cur_snp.genotype[0], cur_snp.genotype[1]))


    idx = np.flatnonzero(np.in1d(all_rs, np.array(all_names))) # indexes in "all_rs" whose rs name also occurs in "all_names"
    # NOTE(review): this assigns all_gen in file order to idx in all_rs
    # order, so it presumably relies on both having the same order and
    # length -- confirm against the caller.
    all_rs_ind_tmp[idx] = np.array(all_gen)

   
    
Beispiel #9
0
    def create_map(self, folders_case, folders_control):
        """Create the .map file"""
        print "\n\nMAP FILE   (", time.asctime(), ")\n"
        dir_list = folders_case[:]
        dir_list.extend(folders_control)
        files = []
        file_dir = {}
        map_list = []
        idx = 0

        for i in dir_list:
            files.extend(os.listdir(i))  #it get all files names in a dir
            for j in os.listdir(i):
                file_dir[j] = i  #dictionari with file:dir

        print "Reading the files:\n\n", file_dir[
            files[0]] + "/" + files[0],  #parse the first file
        try:
            snps = sn.parse(file_dir[files[0]] + "/" +
                            files[0])  #take the first file
            for i in snps:  #initialaze rs_list and map_list with the first file
                map_list.append((i[0], i[2], i[3]))
        except:
            print "   ERRO 1"
        print ""

        dtype = [('rs', 'S10'), ('chrom', 'S10'), ('position', int)]
        map_list = np.array(map_list, dtype=dtype)

        for j in files[1:]:
            map_list_tmp = []
            print file_dir[j] + "/" + j,
            try:
                snps = sn.parse(file_dir[j] + "/" + j)  #take another files
            except:
                print "   ERRO 2"
                continue
            try:
                for i in snps:
                    map_list_tmp.append((i[0], i[2], i[3]))
            except:
                print "   ERRO 3"
                continue
            print ""

            map_list_tmp = np.array(map_list_tmp, dtype=dtype)
            map_list = np.array(np.concatenate((map_list, map_list_tmp)),
                                dtype=dtype)
            u, indices = np.unique(map_list['rs'], return_index=True)
            map_list = map_list[indices]

        array_chrom = np.unique(
            map_list['chrom']
        )  #add new elements to end of the self.chrommosomos
        idx_chr = np.in1d(array_chrom, self.chrommosomos)
        self.chrommosomos = np.concatenate(
            (self.chrommosomos, array_chrom[idx_chr == False]))

        map_list = np.sort(map_list, order=['chrom', 'position'])
        ofile = open(self.outputmap, 'w')  # open file for writing
        print "there are", len(
            map_list['rs']), "SNPs.\nwriting the", self.outputmap, "file..."
        for i in self.chrommosomos:
            if i in map_list['chrom']:
                idx = map_list['chrom'] == i
                for i in map_list[:][idx]:
                    ofile.write(
                        str(i[1]) + " " + str(i[0]) + " " + str(0) + " " +
                        str(i[2]) + "\n")
        ofile.close()