Beispiel #1
0
def convert_cols_to_lines(p_source_dir, p_source_file, p_destination_dir,
                          p_dest_file_list, p_individualsposlist,
                          p_gen_column):
    """Append three columns of a gzipped .gen file to an individual's file.

    The output file is the entry of ``p_dest_file_list`` that
    ``utils.find_file_in_list`` selects for the pattern
    ``^<p_destination_dir>IND_<n>_.*<destination_file_type>$``, where ``n``
    is the 1-based position of ``p_gen_column`` in ``p_individualsposlist``.
    It is opened in append mode and receives one tab-delimited row per line
    of the source file: the source file name (as returned by
    ``utils.get_file_name``), the 1-based line number, and the
    whitespace-separated fields ``col`` to ``col + 2`` of that line, with
    ``col = int(p_gen_column)``.

    Args:
        p_source_dir: Prefix concatenated with ``p_source_file`` to build the
            path of the gzipped source file.
        p_source_file: Name of the gzipped source file.
        p_destination_dir: Prefix used in the pattern that selects the
            output file.
        p_dest_file_list: Candidate output files searched by
            ``utils.find_file_in_list``.
        p_individualsposlist: Sequence containing ``p_gen_column``; its
            position determines the ``IND_<n>_`` part of the pattern.
        p_gen_column: Index of the first of the three fields to copy; must
            be convertible with ``int()``.

    Returns:
        None.

    Raises:
        ValueError: If ``p_gen_column`` is not found by
            ``p_individualsposlist.index`` or cannot be parsed as an integer.
        TypeError: If ``p_gen_column`` has a type ``int()`` does not accept.
    """
    utils.log(logger, "Begin - convert_gen_cols_to_ind_lines - ")
    positionindex = p_individualsposlist.index(p_gen_column)
    regex = r"^{0}.*{1}$".format(
        p_destination_dir + "IND_" + str(positionindex + 1) + "_",
        destination_file_type)

    p_indfilename = utils.find_file_in_list(p_dest_file_list, regex)
    source_file = utils.get_file_name(str(p_source_file))

    try:
        col = int(p_gen_column)
    except (TypeError, ValueError):
        # Log as before, then propagate: without a valid column the loop
        # below would otherwise fail with a misleading NameError on `col`.
        utils.log(logger, sys.exc_info()[0])
        raise

    # Open individuals file
    with open(p_indfilename, 'a') as indfile:
        utils.log(logger, "Writing IND .tsv file: " + p_indfilename)

        csvwriter = csv.writer(indfile,
                               delimiter='\t',
                               quotechar='"',
                               quoting=csv.QUOTE_MINIMAL)

        # Stays 0 when the source file has no lines.
        sequence = 0

        # NOTE(review): 'rb' yields bytes on Python 3, which csv.writer
        # would stringify as b'...'; confirm the target interpreter before
        # switching to text mode.
        with gzip.open(p_source_dir + p_source_file, 'rb') as genfile:
            # Iterate lazily: readlines() would load the whole .gen file
            # into memory, and too many threads or very big files can
            # cause memory overflow.
            for sequence, line in enumerate(genfile, 1):
                columns = line.split()
                csvwriter.writerow(
                    [source_file, str(sequence)] + columns[col:col + 3])

        utils.log(logger, "Lines in source file: " + str(sequence))

    # Both files are closed by their `with` blocks; no explicit close().
    utils.log(logger, "End - convert_gen_cols_to_ind_lines - ")
Beispiel #2
0
def convert_cols_to_lines(p_source_dir, p_source_file, p_destination_dir,
                          p_dest_file_list, p_individualsposlist,
                          p_gen_column):
    """Copy three fields per line of a gzipped .gen file into a .tsv file.

    ``p_gen_column`` is looked up in ``p_individualsposlist``; its 1-based
    position ``n`` is used to build the pattern
    ``^<p_destination_dir>IND_<n>_.*<destination_file_type>$``, which
    ``utils.find_file_in_list`` uses to pick the output file from
    ``p_dest_file_list``. For every line of the source file one
    tab-delimited row is appended to that file, holding the source file name
    (from ``utils.get_file_name``), the 1-based line number, and the
    whitespace-separated fields ``col`` to ``col + 2``, where
    ``col = int(p_gen_column)``.

    Args:
        p_source_dir: Prefix concatenated with ``p_source_file`` to form the
            gzipped input path.
        p_source_file: Name of the gzipped input file.
        p_destination_dir: Prefix used inside the output-file pattern.
        p_dest_file_list: Candidate output files.
        p_individualsposlist: Sequence in which ``p_gen_column`` is located.
        p_gen_column: Index of the first field to copy; must be convertible
            with ``int()``.

    Returns:
        None.

    Raises:
        ValueError: If ``p_gen_column`` is not found by
            ``p_individualsposlist.index`` or is not a valid integer.
        TypeError: If ``p_gen_column`` has a type ``int()`` rejects.
    """
    utils.log(logger, "Begin - convert_gen_cols_to_ind_lines - ")
    positionindex = p_individualsposlist.index(p_gen_column)
    regex = r"^{0}.*{1}$".format(
        p_destination_dir + "IND_" + str(positionindex + 1) + "_",
        destination_file_type)

    p_indfilename = utils.find_file_in_list(p_dest_file_list, regex)
    source_file = utils.get_file_name(str(p_source_file))

    try:
        col = int(p_gen_column)
    except (TypeError, ValueError):
        # A swallowed failure here would leave `col` unbound and surface
        # later as a NameError in the write loop; log and re-raise instead.
        utils.log(logger, sys.exc_info()[0])
        raise

    # Open individuals file
    with open(p_indfilename, 'a') as indfile:
        utils.log(logger, "Writing IND .tsv file: " + p_indfilename)

        csvwriter = csv.writer(indfile,
                               delimiter='\t',
                               quotechar='"',
                               quoting=csv.QUOTE_MINIMAL)

        # Remains 0 if the source file contains no lines.
        sequence = 0

        # NOTE(review): binary mode returns bytes on Python 3, which
        # csv.writer would stringify as b'...'; verify the interpreter
        # version before changing the mode.
        with gzip.open(p_source_dir + p_source_file, 'rb') as genfile:
            # Read line by line; readlines() would load the full .gen file
            # into memory, and too many threads or very big files can
            # cause memory overflow.
            for sequence, line in enumerate(genfile, 1):
                fields = line.split()
                csvwriter.writerow(
                    [source_file, str(sequence)] + fields[col:col + 3])

        utils.log(logger, "Lines in source file: " + str(sequence))

    # The `with` blocks close both files; explicit close() calls are not
    # needed.
    utils.log(logger, "End - convert_gen_cols_to_ind_lines - ")
Beispiel #3
0
# Build the per-batch placeholder and header strings used further below.
batchPlaceholders = utils.make_placeholders(subarrayPos, nbBatches)
headerPlaceholders = utils.make_headers("batch", 0, nbBatches)
f = cursor.fetchone()
current_id = -1
migratedindividuals = 0

utils.log(logger, "Writing <<Individuals>> files")
utils.log(logger, "------------------------------------")

# Process the fetched row; the first column is used as the individual's id.
# NOTE(review): this excerpt ends inside the loop body, so the statement
# that advances `f` is not visible here - confirm against the full script.
while f:
    current_id = f[0]

    if importGenotypesFlag == "Y":
        # Locate this individual's sample file among the source files.
        regex = r"^SAM_[0-9]+_{0}_.*{1}$".format(current_id, file_type_oxford)
        filename = utils.find_file_in_list(sourcesFileList, regex)
        familyID = ''
        fatherID = ''
        motherID = ''

        if filename is None:
            utils.log(logger, "ID: "+str(current_id) +" -Not found in SAMPLE file")
        else:
            # One output directory per individual; os.makedirs raises if
            # the directory already exists.
            os.makedirs(output_dir + str(current_id))

            with open("{0}{1}/Individuals_{1}.tsv".format(output_dir, current_id), "w") as file:
                # Header line; the batch column names come from
                # headerPlaceholders.
                file.write("individualId,familyId,paternalId,maternalId,dateOfBirth,gender,ethnicCode,centreName,region,country,notes,missingCallFreq,{0}\n".format(headerPlaceholders))
                familyID = utils.get_individual_info(sourcesDirectory+filename, "ID_2")
                fatherID = utils.get_individual_info(sourcesDirectory+filename, "father")
                motherID = utils.get_individual_info(sourcesDirectory+filename, "mother")
Beispiel #4
0
# Prepare the per-batch placeholder and header strings used further below.
batchPlaceholders = utils.make_placeholders(subarrayPos, nbBatches)
headerPlaceholders = utils.make_headers("batch", 0, nbBatches)
f = cursor.fetchone()
current_id = -1
migratedindividuals = 0

utils.log(logger, "Writing <<Individuals>> files")
utils.log(logger, "------------------------------------")

# Handle the fetched row; its first column is used as the individual's id.
# NOTE(review): the excerpt stops inside the loop body, so the statement
# that fetches the next row into `f` is not visible here - verify against
# the full script.
while f:
    current_id = f[0]

    if importGenotypesFlag == "Y":
        # Find the sample file that belongs to this individual.
        regex = r"^SAM_[0-9]+_{0}_.*{1}$".format(current_id, file_type_oxford)
        filename = utils.find_file_in_list(sourcesFileList, regex)
        familyID = ''
        fatherID = ''
        motherID = ''

        if filename is None:
            utils.log(logger, "ID: "+str(current_id) +" -Not found in SAMPLE file")
        else:
            # Each individual gets its own output directory; os.makedirs
            # raises if that directory already exists.
            os.makedirs(output_dir + str(current_id))

            with open("{0}{1}/Individuals_{1}.tsv".format(output_dir, current_id), "w") as file:
                # Write the header line; batch column names are supplied by
                # headerPlaceholders.
                file.write("individualId,familyId,paternalId,maternalId,dateOfBirth,gender,ethnicCode,centreName,region,country,notes,missingCallFreq,{0}\n".format(headerPlaceholders))
                familyID = utils.get_individual_info(sourcesDirectory+filename, "ID_2")
                fatherID = utils.get_individual_info(sourcesDirectory+filename, "father")
                motherID = utils.get_individual_info(sourcesDirectory+filename, "mother")