def convert_cols_to_lines(p_source_dir, p_source_file, p_destination_dir,
                          p_dest_file_list, p_individualsposlist, p_gen_column):
    """Append one individual's genotype triplet to its IND .tsv file.

    Reads the gzipped .gen file ``p_source_dir + p_source_file`` line by
    line and, for each line, writes ``source_file, line_number,
    columns[col:col+3]`` (tab-separated) to the IND file found in
    ``p_dest_file_list`` whose name matches
    ``IND_<positionindex+1>_*<destination_file_type>``.

    :param p_source_dir: directory holding the gzipped .gen source file.
    :param p_source_file: name of the gzipped .gen source file.
    :param p_destination_dir: directory prefix used to build the IND-file regex.
    :param p_dest_file_list: candidate destination file names to search.
    :param p_individualsposlist: list of column positions; the index of
        ``p_gen_column`` within it selects the IND file number.
    :param p_gen_column: starting column (numeric string or int) of this
        individual's 3-column genotype block.
    :raises ValueError: if ``p_gen_column`` is not convertible to int.
    """
    utils.log(logger, "Begin - convert_gen_cols_to_ind_lines - ")
    positionindex = p_individualsposlist.index(p_gen_column)
    regex = r"^{0}.*{1}$".format(
        p_destination_dir + "IND_" + str(positionindex + 1) + "_",
        destination_file_type)
    p_indfilename = utils.find_file_in_list(p_dest_file_list, regex)
    source_file = utils.get_file_name(str(p_source_file))
    # Fail fast on a non-numeric column. The original bare `except:` logged
    # the error and fell through, so `col` was unbound and the loop below
    # crashed later with an unrelated NameError.
    try:
        col = int(p_gen_column)
    except (TypeError, ValueError):
        utils.log(logger, sys.exc_info()[0])
        raise
    # Open the individuals file for append; `with` closes it on all paths,
    # so no explicit close() calls are needed.
    with open(p_indfilename, 'a') as indfile:
        utils.log(logger, "Writing IND .tsv file: " + p_indfilename)
        csvwriter = csv.writer(indfile, delimiter='\t', quotechar='"',
                               quoting=csv.QUOTE_MINIMAL)
        sequence = 0
        # NOTE(review): mode 'rb' + raw split() looks like Python 2 legacy;
        # under Python 3 the columns would be bytes — confirm interpreter.
        with gzip.open(p_source_dir + p_source_file, 'rb') as genfile:
            # Stream line by line: readlines() would load the full .gen
            # file into memory, and many threads or very big files could
            # cause memory overflow.
            for line in genfile:
                sequence = sequence + 1
                columns = line.split()
                # Each output row: source file name, 1-based line number,
                # then this individual's 3 genotype-probability columns.
                csvwriter.writerow([source_file, str(sequence)]
                                   + columns[col:col + 3])
        utils.log(logger, "Lines in source file: " + str(sequence))
    utils.log(logger, "End - convert_gen_cols_to_ind_lines - ")
    return
# NOTE(review): this is a byte-near-identical duplicate of the definition
# immediately above it in this file; being defined second, it silently
# shadows the first. One of the two should be deleted — confirm with the
# module owner which copy callers are meant to get.
def convert_cols_to_lines(p_source_dir, p_source_file, p_destination_dir,
                          p_dest_file_list, p_individualsposlist, p_gen_column):
    """Append one individual's genotype triplet to its IND .tsv file.

    Streams the gzipped .gen file at ``p_source_dir + p_source_file`` and
    appends, for every input line, a tab-separated row
    ``[source_file, line_number] + columns[col:col+3]`` to the matching
    ``IND_<n>_*`` destination file taken from ``p_dest_file_list``.

    :raises ValueError: if ``p_gen_column`` cannot be converted to int.
    """
    utils.log(logger, "Begin - convert_gen_cols_to_ind_lines - ")
    positionindex = p_individualsposlist.index(p_gen_column)
    regex = r"^{0}.*{1}$".format(
        p_destination_dir + "IND_" + str(positionindex + 1) + "_",
        destination_file_type)
    p_indfilename = utils.find_file_in_list(p_dest_file_list, regex)
    source_file = utils.get_file_name(str(p_source_file))
    # Bug fix: the original bare `except:` swallowed the conversion error
    # and continued, leaving `col` unbound and causing a NameError below.
    # Log, then re-raise so the caller sees the real problem.
    try:
        col = int(p_gen_column)
    except (TypeError, ValueError):
        utils.log(logger, sys.exc_info()[0])
        raise
    with open(p_indfilename, 'a') as indfile:
        utils.log(logger, "Writing IND .tsv file: " + p_indfilename)
        csvwriter = csv.writer(indfile, delimiter='\t', quotechar='"',
                               quoting=csv.QUOTE_MINIMAL)
        sequence = 0
        # NOTE(review): 'rb' + split() suggests Python 2; under Python 3 the
        # split columns are bytes objects — verify target interpreter.
        with gzip.open(p_source_dir + p_source_file, 'rb') as genfile:
            # Iterate the file object directly; readlines() would load the
            # whole .gen file into memory and can overflow with many
            # threads or very large files.
            for line in genfile:
                sequence = sequence + 1
                columns = line.split()
                csvwriter.writerow([source_file, str(sequence)]
                                   + columns[col:col + 3])
        # The `with` blocks close both files; the original's explicit
        # close() calls were redundant and have been dropped.
        utils.log(logger, "Lines in source file: " + str(sequence))
    utils.log(logger, "End - convert_gen_cols_to_ind_lines - ")
    return
batchPlaceholders = utils.make_placeholders(subarrayPos, nbBatches) headerPlaceholders = utils.make_headers("batch", 0, nbBatches) f = cursor.fetchone() current_id = -1 migratedindividuals=0 utils.log(logger, "Writing <<Individuals>> files") utils.log(logger, "------------------------------------") while f: current_id = f[0] if importGenotypesFlag == "Y": regex = r"^SAM_[0-9]+_{0}_.*{1}$".format(current_id,file_type_oxford) filename = utils.find_file_in_list(sourcesFileList, regex) familyID = '' fatherID = '' motherID = '' if filename == None: utils.log(logger, "ID: "+str(current_id) +" -Not found in SAMPLE file") else: os.makedirs(output_dir + str(current_id)) with open("{0}{1}/Individuals_{1}.tsv".format(output_dir, current_id), "w") as file: file.write("individualId,familyId,paternalId,maternalId,dateOfBirth,gender,ethnicCode,centreName,region,country,notes,missingCallFreq,{0}\n".format(headerPlaceholders)) familyID=utils.get_individual_info(sourcesDirectory+filename, "ID_2") fatherID=utils.get_individual_info(sourcesDirectory+filename, "father") motherID=utils.get_individual_info(sourcesDirectory+filename, "mother")