LOG.info("Parsing VCF file...") # not all chromosomes/seqid will be processed if not in vcf file processed_seqids = OrderedDict() for seqid in tb.contigs: processed_seqids[seqid] = False left = VCFtoChainInfo() right = VCFtoChainInfo() chain_info = {} if diploid: left.output_file = g2g_fu.prepend_before_extension(output_file, 'left') right.output_file = g2g_fu.prepend_before_extension( output_file, 'right') chain_info['left'] = left chain_info['right'] = right g2g_fu.delete_file(left.output_file) g2g_fu.delete_file(right.output_file) else: left.output_file = output_file chain_info['left'] = left g2g_fu.delete_file(left.output_file) try: all_chrom = [c for c in fasta_file.references]
def prepare_fasta_patch(filename_fasta, filename_output, bgzip=False, diploid=False):
    """
    Initialize fasta_patch variables by staging an uncompressed copy of the fasta.

    The input fasta is copied to the output location (and decompressed first
    when its name ends in '.fa.gz').  For diploid runs a second, identical
    copy is made.  The first copy is then opened with pysam so that a
    temporary fasta index exists for it.

    :param filename_fasta: input fasta file name, ending in '.fa' or '.fa.gz'
    :param filename_output: output file name, ending in '.fa' or '.fa.gz'
    :param bgzip: accepted for interface compatibility; not used by this function
    :param diploid: when True, prepare separate 'l' and 'r' output files
    :return: tuple (filename_output_l, filename_output_r); the second item is
        None unless ``diploid`` is True
    :raises G2GValueError: if the input or output file name does not end in
        '.fa' or '.fa.gz'
    """
    filename_output = g2g_fu.check_file(filename_output, 'w')
    output_file_dir = os.path.abspath(os.path.dirname(filename_output))

    new_filename_output = filename_output

    # let's figure out what our output names will be
    if filename_output.lower().endswith('.gz'):
        # strip off .gz
        new_filename_output = filename_output[:-3]

    # Validate the name *after* stripping '.gz'; checking the unstripped name
    # rejected every '.fa.gz' output even though the message says it is allowed.
    if not new_filename_output.lower().endswith('.fa'):
        raise G2GValueError("Expecting output filename extension to be either '.fa.gz' or '.fa'")

    if diploid:
        filename_output_l = g2g_fu.prepend_before_extension(new_filename_output, 'l')
        filename_output_r = g2g_fu.prepend_before_extension(new_filename_output, 'r')

        g2g_fu.delete_index_files(filename_output_l)
        g2g_fu.delete_index_files(filename_output_r)
    else:
        filename_output_l = new_filename_output
        filename_output_r = None

        g2g_fu.delete_index_files(filename_output_l)

    # at this point we are hoping for a .fa extension

    # let's figure out our input and process accordingly
    if filename_fasta.lower().endswith('.fa.gz'):
        # decompress the fasta file if it is compressed
        LOG.info("Copying and decompressing fasta file")

        # copy file and preserve gz extension for bgzip -d to work
        tmp_file_name = os.path.basename(filename_fasta)  # something.gz
        LOG.debug("tmp_file_name={0}".format(tmp_file_name))

        tmp_fasta = os.path.join(output_file_dir, tmp_file_name)  # /path/something.fa.gz
        LOG.debug("tmp_fasta={0}".format(tmp_fasta))

        LOG.debug("COPYING {0} to {1}".format(filename_fasta, tmp_fasta))
        shutil.copy(filename_fasta, tmp_fasta)  # cp /original/something.fa.gz /output/something.fa.gz

        LOG.debug("DECOMPRESSING {0}".format(tmp_fasta))
        g2g_fu.bgzip_decompress(tmp_fasta)

        tmp_fasta = tmp_fasta[:-3]  # /path/something.fa
        LOG.debug("tmp_fasta={0}".format(tmp_fasta))

        LOG.debug("Moving '{0}' to '{1}'...".format(tmp_fasta, filename_output_l))
        shutil.move(tmp_fasta, filename_output_l)
    elif filename_fasta.lower().endswith('.fa'):
        LOG.debug("File is not compressed")

        LOG.debug("COPYING {0} to {1}".format(filename_fasta, filename_output_l))
        shutil.copy(filename_fasta, filename_output_l)
    else:
        raise G2GValueError("Expecting input filename extension to be either '.fa.gz' or '.fa'")

    if diploid:
        LOG.debug("Copying '{0}' to '{1}'...".format(filename_output_l, filename_output_r))
        shutil.copy(filename_output_l, filename_output_r)

    # build a temporary fasta index; the handle itself is not needed, so
    # release it right away instead of leaving it to garbage collection
    pysam.FastaFile(filename_output_l).close()

    return filename_output_l, filename_output_r
def prepare_fasta_patch(filename_fasta, filename_output, bgzip=False, diploid=False):
    """
    Initialize fasta_patch variables by staging an uncompressed copy of the fasta.

    The input fasta is copied to the output location (and decompressed first
    when its name ends in '.fa.gz').  For diploid runs a second, identical
    copy is made.  The first copy is then opened with pysam so that a
    temporary fasta index exists for it.

    :param filename_fasta: input fasta file name, ending in '.fa' or '.fa.gz'
    :param filename_output: output file name, ending in '.fa' or '.fa.gz'
    :param bgzip: accepted for interface compatibility; not used by this function
    :param diploid: when True, prepare separate 'l' and 'r' output files
    :return: tuple (filename_output_l, filename_output_r); the second item is
        None unless ``diploid`` is True
    :raises G2GValueError: if the input or output file name does not end in
        '.fa' or '.fa.gz'
    """
    filename_output = g2g_fu.check_file(filename_output, "w")
    output_file_dir = os.path.abspath(os.path.dirname(filename_output))

    new_filename_output = filename_output

    # let's figure out what our output names will be
    if filename_output.lower().endswith(".gz"):
        # strip off .gz
        new_filename_output = filename_output[:-3]

    # Validate the name *after* stripping '.gz'; checking the unstripped name
    # rejected every '.fa.gz' output even though the message says it is allowed.
    if not new_filename_output.lower().endswith(".fa"):
        raise G2GValueError("Expecting output filename extension to be either '.fa.gz' or '.fa'")

    if diploid:
        filename_output_l = g2g_fu.prepend_before_extension(new_filename_output, "l")
        filename_output_r = g2g_fu.prepend_before_extension(new_filename_output, "r")

        g2g_fu.delete_index_files(filename_output_l)
        g2g_fu.delete_index_files(filename_output_r)
    else:
        filename_output_l = new_filename_output
        filename_output_r = None

        g2g_fu.delete_index_files(filename_output_l)

    # at this point we are hoping for a .fa extension

    # let's figure out our input and process accordingly
    if filename_fasta.lower().endswith(".fa.gz"):
        # decompress the fasta file if it is compressed
        LOG.info("Copying and decompressing fasta file")

        # copy file and preserve gz extension for bgzip -d to work
        tmp_file_name = os.path.basename(filename_fasta)  # something.gz
        LOG.debug("tmp_file_name={0}".format(tmp_file_name))

        tmp_fasta = os.path.join(output_file_dir, tmp_file_name)  # /path/something.fa.gz
        LOG.debug("tmp_fasta={0}".format(tmp_fasta))

        LOG.debug("COPYING {0} to {1}".format(filename_fasta, tmp_fasta))
        shutil.copy(filename_fasta, tmp_fasta)  # cp /original/something.fa.gz /output/something.fa.gz

        LOG.debug("DECOMPRESSING {0}".format(tmp_fasta))
        g2g_fu.bgzip_decompress(tmp_fasta)

        tmp_fasta = tmp_fasta[:-3]  # /path/something.fa
        LOG.debug("tmp_fasta={0}".format(tmp_fasta))

        LOG.debug("Moving '{0}' to '{1}'...".format(tmp_fasta, filename_output_l))
        shutil.move(tmp_fasta, filename_output_l)
    elif filename_fasta.lower().endswith(".fa"):
        LOG.debug("File is not compressed")

        LOG.debug("COPYING {0} to {1}".format(filename_fasta, filename_output_l))
        shutil.copy(filename_fasta, filename_output_l)
    else:
        raise G2GValueError("Expecting input filename extension to be either '.fa.gz' or '.fa'")

    if diploid:
        LOG.debug("Copying '{0}' to '{1}'...".format(filename_output_l, filename_output_r))
        shutil.copy(filename_output_l, filename_output_r)

    # build a temporary fasta index; the handle itself is not needed, so
    # release it right away instead of leaving it to garbage collection
    pysam.FastaFile(filename_output_l).close()

    return filename_output_l, filename_output_r
LOG.info("Parsing VCF file...") # not all chromosomes/seqid will be processed if not in vcf file processed_seqids = OrderedDict() for seqid in tb.contigs: processed_seqids[seqid] = False left = VCFtoChainInfo() right = VCFtoChainInfo() chain_info = {} if diploid: left.output_file = g2g_fu.prepend_before_extension(output_file, 'left') right.output_file = g2g_fu.prepend_before_extension(output_file, 'right') chain_info['left'] = left chain_info['right'] = right g2g_fu.delete_file(left.output_file) g2g_fu.delete_file(right.output_file) else: left.output_file = output_file chain_info['left'] = left g2g_fu.delete_file(left.output_file) try: all_chrom = [c for c in fasta_file.references] all_chrom_length = [n for n in fasta_file.lengths]