def create_merged_seq(self):
    """Write a merged, target-ordered query sequence to merged_seq.fa.

    Reads the longest-segment alignment table produced earlier in
    ``<out_dir>/longest_segments.txt``, orders rows by target and target
    start, and emits one FASTA record per target.  Consecutive query
    segments on the same target are separated by a run of 1000 Ns.
    """
    align = pd.read_table(os.path.join(self.out_dir, "longest_segments.txt"))
    # DataFrame.sort(columns=...) was removed in pandas 0.20;
    # sort_values(by=...) is the supported equivalent.
    align.sort_values(by=["target_name", "target_final_start"], inplace=True)
    curr_target_name = ""
    i = 0
    start = 0  # running column position within the wrapped FASTA line
    # Context manager guarantees the output handle is closed, even on error
    # (the original never closed it).
    with open(os.path.join(self.out_dir, "merged_seq.fa"), "w") as out:
        for row in align.iterrows():
            query = row[1]['query_name']
            target = row[1]['target_name']
            fa = FastaHack(self.query_fas[query])
            if curr_target_name != target:
                # Start a new FASTA record; terminate the previous one first.
                if i > 0:
                    out.write("\n")
                i = 1
                out.write(">{0}\n".format(target))
                curr_target_name = target
            fasub = fa.get_sequence(query)
            if row[1]['query_strand'] == "-":
                fasub = futil.reverse_complement(fasub)
            fasub1, start = futil.format_fasta(fasub, start)
            # Plain loops instead of side-effect list comprehensions.
            for chunk in fasub1:
                out.write(chunk)
            # 1 kb of Ns separates consecutive segments on the same target.
            ns = "N" * 1000
            ns1, start = futil.format_fasta(ns, start)
            for chunk in ns1:
                out.write(chunk)
        # NOTE(review): final newline placed after the loop (terminates the
        # last record) — the collapsed original is ambiguous here; confirm.
        out.write("\n")
class FastaTestMore(FastaHackTest):
    """Additional query-interface checks against the shared test FASTA."""

    def setUp(self):
        """Open the test FASTA before each test."""
        self.fa = FastaHack(FA)

    def test_query(self):
        """All three query styles must agree on the same 5 bp region."""
        expected = "TAACC"
        self.assertEqual(self.fa.get_sub_sequence("1", 0, 4), expected)
        self.assertEqual(self.fa.get_sequence("1:1-5"), expected)
        self.assertEqual(self.fa["1:1-5"], expected)
        # A shorter slice of the same contig.
        self.assertEqual(self.fa["1:4-5"], "CC")
def tab_to_vcf(input_file, output_file, reference_file, columns, info_fields, convert_iupac=False):
    """
    Convert a tab-delimited file to VCF.

    Supports the fixed VCF fields: #CHROM, POS, ID, REF, ALT, QUAL,
    FILTER, INFO.  PyVCF's _Record class requires the following
    arguments: CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT,
    sample_indexes.

    convert_iupac (bool) : When present, convert IUPAC codes to the
    non-reference allele.  This is only possible when the reference and
    IUPAC-determined alternates share at least one allele.  Tri-allelic
    conversion is not supported and will emit a warning.

    IUPAC codes: http://www.bioinformatics.org/sms/iupac.html
    """
    reference_dict = FastaHack(reference_file)

    with open(input_file, "r") as tab_fh:
        tab_rows = csv.DictReader(tab_fh, delimiter="\t")
        with open(TEMPLATE_VCF_FILE, "r") as template_fh:
            template = vcf.Reader(template_fh)
            with open(output_file, "w") as out_fh:
                writer = vcf.Writer(out_fh, template, lineterminator='\n')
                for row in tab_rows:
                    # Map each fixed VCF column to its tab-file column,
                    # defaulting to "." when unmapped or absent.
                    args = [row.get(columns.get(field, None), ".")
                            for field in VCF_COLUMN_ORDER]

                    # POS must be an integer.
                    args[POSITION_INDEX] = int(args[POSITION_INDEX])

                    # Convert indels from GATK to VCF format.
                    alt = args[ALT_INDEX]
                    if alt.startswith(("+", "-")) and "/" not in alt:
                        args = gatk_indel_to_vcf(args, reference_dict)

                    # Optionally convert IUPAC code.
                    if convert_iupac:
                        args = _convert_iupac(args)

                    # ALT is a list in a VCF record.
                    args[ALT_INDEX] = [args[ALT_INDEX]]

                    # Copy over any requested INFO fields present in the row.
                    info = {}
                    if info_fields:
                        for vcf_field, tab_field in info_fields.items():
                            if tab_field in row:
                                info[vcf_field] = row[tab_field]

                    # INFO, FORMAT, and sample_indexes complete the record.
                    args.extend([info, ".", []])
                    writer.write_record(_Record(*args))
def tab_to_vcf(input_file, output_file, reference_file):
    """
    Convert a tab-delimited file to VCF.

    Supports the fixed VCF fields: #CHROM, POS, ID, REF, ALT, QUAL,
    FILTER, INFO.  PyVCF's _Record class requires the following
    arguments: CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT,
    sample_indexes.
    """
    reference_dict = FastaHack(reference_file)

    with open(input_file, "r") as tab_fh:
        rows = csv.DictReader(tab_fh, delimiter="\t")
        with open(TEMPLATE_VCF_FILE, "r") as template_fh:
            template = vcf.Reader(template_fh)
            with open(output_file, "w") as out_fh:
                writer = vcf.Writer(out_fh, template, lineterminator='\n')
                for row in rows:
                    # Pull each mapped tab column, defaulting to "." when
                    # the row lacks it.
                    args = [row.get(tab_field, ".")
                            for _vcf_field, tab_field in VCF_TO_FIELDS]

                    # POS must be an integer.
                    args[POSITION_INDEX] = int(args[POSITION_INDEX])

                    # Convert indels from GATK to VCF format.
                    alt = args[ALT_INDEX]
                    if alt.startswith(("+", "-")) and "/" not in alt:
                        args = gatk_indel_to_vcf(args, reference_dict)

                    # ALT is a list in a VCF record.
                    args[ALT_INDEX] = [args[ALT_INDEX]]

                    # Empty INFO, placeholder FORMAT, no samples.
                    args.extend([{}, ".", []])
                    writer.write_record(_Record(*args))
def setUp(self):
    """Open the test FASTA once before each test case runs."""
    fasta = FastaHack(FA)
    self.fa = fasta
# NOTE(review): Python 2 script (print statements, print>>stderr).  The
# visible span sets up options and output tracks; the per-contig loop
# appears to continue beyond this chunk.
if __name__=='__main__':
    # Command-line options: contig/window definitions, input FASTA, and
    # the output DenseTrackSet path.
    opts = OptionParser()
    opts.add_option('','--contig_file',dest='fn_contig_file')
    opts.add_option('','--wnd_width',dest='wnd_width',default=None,type=int)
    opts.add_option('','--wnd_slide',dest='wnd_slide',default=None,type=int)
    opts.add_option('','--wnd_pickle',dest='wnd_pickle',default=None)
    opts.add_option('','--wnd_contig_file',dest='fn_wnd_contig_file')
    opts.add_option('','--fasta',dest='fn_fasta')
    opts.add_option('','--fn_out',dest='fn_out',default=None)
    opts.add_option('','--sunk_based',dest='sunk_based',action='store_true',default=False)
    (o, args) = opts.parse_args()

    # Index the reference FASTA for random access.
    print "loading fasta %s..."%(o.fn_fasta)
    fa = FastaHack(o.fn_fasta)

    print>>stderr, "%s"%(o.fn_out)
    # Output container: one float track (GC fraction) and two uint32
    # tracks (window starts/ends), overwriting any existing file.
    out_wnd_DTS = DenseTrackSet(o.fn_wnd_contig_file,
                                "%s"%(o.fn_out),
                                overwrite=True,
                                openMode='w')

    out_wnd_DTS.addGroup("GC")
    out_wnd_DTS.addGroup("starts")
    out_wnd_DTS.addGroup("ends")
    out_wnd_DTS['GC'].addArray(tables.Float32Atom(),[])
    out_wnd_DTS['starts'].addArray(tables.UInt32Atom(),[])
    out_wnd_DTS['ends'].addArray(tables.UInt32Atom(),[])
    ###WE ONLY NEED THE STARTS because start[k],start[k+1] == start[k], end[k]