def create_merged_seq(self):
        """Write the merged, target-ordered query sequences to ``merged_seq.fa``.

        Reads ``longest_segments.txt`` from ``self.out_dir``, orders its rows by
        ``target_name`` and then ``target_final_start``, and writes one FASTA
        record per target name to ``merged_seq.fa`` in the same directory.

        For every row, the query sequence is fetched through
        ``FastaHack(self.query_fas[query])``, reverse-complemented when the
        row's ``query_strand`` is "-", formatted with ``futil.format_fasta``
        and written out, followed by a run of 1000 ``N`` characters.

        Changes relative to the previous implementation:
          * the output file is closed deterministically (``with`` block);
          * a newline is written before every header except the first one
            (the old ``i`` counter was only updated inside ``if i > 0`` and
            therefore never left 0, so the separator was never emitted);
          * ``DataFrame.sort_values`` is used when available, because
            ``DataFrame.sort(columns=...)`` no longer exists in current pandas.
        """
        align = pd.read_table(os.path.join(self.out_dir, "longest_segments.txt"))

        sort_keys = ["target_name", "target_final_start"]
        if hasattr(align, "sort_values"):
            align.sort_values(by=sort_keys, inplace=True)
        else:
            # Very old pandas releases only provide DataFrame.sort().
            align.sort(columns=sort_keys, inplace=True)

        # The spacer written after every query sequence never changes.
        ns = "N" * 1000

        curr_target_name = ""
        header_written = False
        start = 0
        with open(os.path.join(self.out_dir, "merged_seq.fa"), "w") as out:
            for _, row in align.iterrows():
                query = row['query_name']
                target = row['target_name']
                fa = FastaHack(self.query_fas[query])

                if curr_target_name != target:
                    if header_written:
                        # Terminate the previous record's last sequence line
                        # so the new header starts on its own line.
                        out.write("\n")
                    out.write(">{0}\n".format(target))
                    curr_target_name = target
                    header_written = True
                    # NOTE(review): ``start`` is carried over from the
                    # previous record, as before. If format_fasta treats it
                    # as a column offset it probably should be reset to 0
                    # here -- confirm against futil.format_fasta.

                fasub = fa.get_sequence(query)
                if row['query_strand'] == "-":
                    fasub = futil.reverse_complement(fasub)
                seq_chunks, start = futil.format_fasta(fasub, start)
                out.writelines(seq_chunks)

                gap_chunks, start = futil.format_fasta(ns, start)
                out.writelines(gap_chunks)

            out.write("\n")
Beispiel #2
0
class FastaTestMore(FastaHackTest):
    """Query checks against a FastaHack reader built from ``FA``."""

    def setUp(self):
        # Build the reader that test_query inspects.
        self.fa = FastaHack(FA)

    def test_query(self):
        # The first two assertions expect the same five bases whether they are
        # requested as ("1", 0, 4) through get_sub_sequence or as the region
        # string "1:1-5" through get_sequence.
        reader = self.fa
        first_five = "TAACC"
        self.assertEqual(reader.get_sub_sequence("1", 0, 4), first_five)
        self.assertEqual(reader.get_sequence("1:1-5"), first_five)

        # Item access with a region string is expected to behave the same way.
        self.assertEqual(reader["1:1-5"], first_five)
        last_two = "CC"
        self.assertEqual(reader["1:4-5"], last_two)
Beispiel #3
0
def tab_to_vcf(input_file, output_file, reference_file, columns, info_fields, convert_iupac=False):
    """
    Translate a tab-delimited variant table into a VCF file.

    The fixed VCF fields are supported: #CHROM, POS, ID, REF, ALT, QUAL,
    FILTER, INFO.

    Each output record is a PyVCF ``_Record``, which takes these arguments:

    CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT, sample_indexes

    input_file : path of the tab-delimited file (read with csv.DictReader).
    output_file : path of the VCF file to write.
    reference_file : path handed to FastaHack; the resulting object is passed
        to gatk_indel_to_vcf when an indel is converted.
    columns : mapping from each name in VCF_COLUMN_ORDER to the input column
        holding that value; anything missing becomes ".".
    info_fields : mapping from INFO key to input column name; only columns
        present in the row are copied. A falsy value yields an empty INFO.
    convert_iupac (bool) : When set, convert IUPAC codes to the non-reference
        allele. This is only possible when the reference and the
        IUPAC-determined alternates share at least one allele. Tri-allelic
        conversion is not supported and will emit a warning.
        IUPAC codes: http://www.bioinformatics.org/sms/iupac.html
    """
    reference_dict = FastaHack(reference_file)

    def build_record(row):
        # Pull the fixed VCF columns out of the row in VCF_COLUMN_ORDER order.
        fields = []
        for vcf_column in VCF_COLUMN_ORDER:
            fields.append(row.get(columns.get(vcf_column, None), "."))

        # The position has to be an integer.
        fields[POSITION_INDEX] = int(fields[POSITION_INDEX])

        # GATK-style indels ("+..." / "-..." without "/") are rewritten to
        # VCF form.
        alt = fields[ALT_INDEX]
        if alt.startswith(("+", "-")) and "/" not in alt:
            fields = gatk_indel_to_vcf(fields, reference_dict)

        # IUPAC conversion happens only on request.
        if convert_iupac:
            fields = _convert_iupac(fields)

        # The alternate allele is wrapped in a list.
        fields[ALT_INDEX] = [fields[ALT_INDEX]]

        # Gather the requested INFO entries that this row actually has.
        info = {}
        if info_fields:
            info = {
                vcf_field: row[tab_field]
                for vcf_field, tab_field in info_fields.items()
                if tab_field in row
            }

        # Append INFO plus empty FORMAT and sample_indexes entries.
        fields.extend([info, ".", []])
        return _Record(*fields)

    with open(input_file, "r") as input_fh:
        reader = csv.DictReader(input_fh, delimiter="\t")

        with open(TEMPLATE_VCF_FILE, "r") as template_fh:
            vcf_reader = vcf.Reader(template_fh)

            with open(output_file, "w") as output_fh:
                vcf_writer = vcf.Writer(output_fh, vcf_reader, lineterminator='\n')

                for row in reader:
                    vcf_writer.write_record(build_record(row))
Beispiel #4
0
def tab_to_vcf(input_file, output_file, reference_file):
    """
    Translate a tab-delimited variant table into a VCF file.

    The fixed VCF fields are supported: #CHROM, POS, ID, REF, ALT, QUAL,
    FILTER, INFO.

    Each output record is a PyVCF ``_Record``, which takes these arguments:

    CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT, sample_indexes

    input_file : path of the tab-delimited file (read with csv.DictReader).
    output_file : path of the VCF file to write.
    reference_file : path handed to FastaHack; the resulting object is passed
        to gatk_indel_to_vcf when an indel is converted.
    """
    reference_dict = FastaHack(reference_file)

    def build_record(row):
        # One value per (vcf_field, tab_field) pair; absent columns give ".".
        fields = [row.get(pair[1], ".") for pair in VCF_TO_FIELDS]

        # The position has to be an integer.
        fields[POSITION_INDEX] = int(fields[POSITION_INDEX])

        # GATK-style indels ("+..." / "-..." without "/") are rewritten to
        # VCF form.
        alt = fields[ALT_INDEX]
        if alt.startswith(("+", "-")) and "/" not in alt:
            fields = gatk_indel_to_vcf(fields, reference_dict)

        # The alternate allele is wrapped in a list.
        fields[ALT_INDEX] = [fields[ALT_INDEX]]

        # Append empty INFO, FORMAT and sample_indexes entries.
        fields.extend([{}, ".", []])
        return _Record(*fields)

    with open(input_file, "r") as input_fh:
        reader = csv.DictReader(input_fh, delimiter="\t")

        with open(TEMPLATE_VCF_FILE, "r") as template_fh:
            vcf_reader = vcf.Reader(template_fh)

            with open(output_file, "w") as output_fh:
                vcf_writer = vcf.Writer(
                    output_fh, vcf_reader, lineterminator='\n'
                )

                for row in reader:
                    vcf_writer.write_record(build_record(row))
Beispiel #5
0
 def setUp(self):
     # Build a FastaHack reader over ``FA`` (defined outside this snippet) and
     # keep it on the instance as ``self.fa``.
     # NOTE(review): presumably a unittest fixture hook -- the enclosing class
     # is not visible here.
     self.fa = FastaHack(FA)
# Script entry point: parse the command-line options, open the FASTA file and
# create the output track set with its "GC", "starts" and "ends" groups.
if __name__=='__main__':
    # Every option is long-form only (the short option string is empty).
    opts = OptionParser()
    opts.add_option('','--contig_file',dest='fn_contig_file')
    opts.add_option('','--wnd_width',dest='wnd_width',default=None,type=int)
    opts.add_option('','--wnd_slide',dest='wnd_slide',default=None,type=int)
    opts.add_option('','--wnd_pickle',dest='wnd_pickle',default=None)
    opts.add_option('','--wnd_contig_file',dest='fn_wnd_contig_file')
    opts.add_option('','--fasta',dest='fn_fasta')
    opts.add_option('','--fn_out',dest='fn_out',default=None)
    opts.add_option('','--sunk_based',dest='sunk_based',action='store_true',default=False)
    
    (o, args) = opts.parse_args()
    

    # Open the FASTA file named by --fasta.
    print "loading fasta %s..."%(o.fn_fasta)
    fa = FastaHack(o.fn_fasta)

    # Echo the output path to stderr, then create the output track set from
    # the --wnd_contig_file and --fn_out values.
    # NOTE(review): overwrite=True / openMode='w' presumably replace an
    # existing output file -- confirm against DenseTrackSet.
    print>>stderr, "%s"%(o.fn_out)    
    out_wnd_DTS = DenseTrackSet(o.fn_wnd_contig_file,
                                            "%s"%(o.fn_out),
                                            overwrite=True,
                                            openMode='w')
    
    # One group per track; each group gets an array created with a
    # Float32Atom ("GC") or a UInt32Atom ("starts", "ends").
    out_wnd_DTS.addGroup("GC")
    out_wnd_DTS.addGroup("starts")
    out_wnd_DTS.addGroup("ends")
    out_wnd_DTS['GC'].addArray(tables.Float32Atom(),[])
    out_wnd_DTS['starts'].addArray(tables.UInt32Atom(),[])
    out_wnd_DTS['ends'].addArray(tables.UInt32Atom(),[])
    ###WE ONLY NEED THE STARTS because start[k],start[k+1] == start[k], end[k]