def test_deletion_insertion_mismatch(self):
        """ A spliced transcript carrying an insertion, a deletion, and a
            mismatch must be given the expected MD and NM tags. """

        sam_path = "input_files/sams/deletion_insertion_mismatch.sam"
        reference = Fasta("input_files/hg38_chr1.fa")

        # Only the first line of the SAM file is used as the alignment.
        with open(sam_path, 'r') as handle:
            fields = handle.readline().strip().split('\t')
            transcript = t2.Transcript(fields, reference, {})

        assert transcript.MD == "MD:Z:475C0A0C0C0A1082^C1347G17C205"
        assert transcript.NM == "NM:i:9"
    def test_perfect_match_with_introns(self):
        """ A transcript that matches the reference perfectly and contains
            introns must be given the expected MD and NM tags. """

        sam_path = "input_files/sams/perfectReferenceMatch_twoIntrons.sam"
        reference = Fasta("input_files/hg38_chr1.fa")

        # Only the first line of the SAM file is used as the alignment.
        with open(sam_path, 'r') as handle:
            fields = handle.readline().strip().split('\t')
            transcript = t2.Transcript(fields, reference, {})

        assert transcript.MD == "MD:Z:3400"
        assert transcript.NM == "NM:i:0"
    def test_insertion(self):
        """ A spliced transcript carrying an insertion must be given the
            expected MD and NM tags. """

        sam_path = "input_files/sams/insertion.sam"
        reference = Fasta("input_files/hg38_chr1.fa")

        # Only the first line of the SAM file is used as the alignment.
        with open(sam_path, 'r') as handle:
            fields = handle.readline().strip().split('\t')
            transcript = t2.Transcript(fields, reference, {})

        assert transcript.MD == "MD:Z:3069"
        assert transcript.NM == "NM:i:2"
    def test_insertion_deletion_mismatch_ncsj(self):
        """ A transcript carrying an insertion, a deletion, a mismatch, and a
            noncanonical splice junction must be given the expected MD and
            NM tags. """

        sam_path = "input_files/sams/deletion_insertion_mismatch_nc.sam"
        reference = Fasta("input_files/hg38_chr1.fa")

        # Only the first line of the SAM file is used as the alignment.
        with open(sam_path, 'r') as handle:
            fields = handle.readline().strip().split('\t')
            transcript = t2.Transcript(fields, reference, {})

        assert transcript.MD == "MD:Z:414G0A450^C405^C2435"
        assert transcript.NM == "NM:i:5"
Example #5
0
def get_seq(random_cns_list, fasta_file, new_fasta_name):
    """Write the sequence of every listed region to a new FASTA file.

    Args:
        random_cns_list: iterable of ``(accn, seqid, start, end)`` tuples.
        fasta_file: path handed to ``Fasta`` to load the source sequences.
        new_fasta_name: path of the FASTA file to create.

    Each region is fetched as ``fasta[seqid][start:end]`` and written as a
    record named ``cns<N>``, where N counts the regions from 1 in input
    order. A region whose sequence contains "X", or is 1-14 characters long,
    is reported on stdout but is still written.
    """
    # Load the reference once; it is the same for every region.
    fasta = Fasta(fasta_file)
    # The context manager guarantees the output is flushed and closed.
    with open("{0}".format(new_fasta_name), "wb") as new_fasta:
        for n, (accn, seqid, start, end) in enumerate(random_cns_list, 1):
            seq = fasta[seqid][start:end]
            if "X" in seq:
                print("{0} {1} {2} {3}".format(accn, seqid, start, end))
            if 0 < len(seq) < 15:
                print("OH NO!!!!!!")
            new_fasta.write(">cns{0}\n".format(n))
            new_fasta.write("{0}\n".format(seq))
Example #6
0
def ExtractSg(str_refgem, str_chr, int_beg, int_end, str_drct):
    """Yield the items produced by ``__ParseSeq`` for the requested strand(s).

    ``str_drct`` selects what is yielded: 'f' gives the '+' strand results,
    'r' the '-' strand results, and 'b' the '+' results followed by the '-'
    results. Any other value yields nothing.
    """
    reference = Fasta(str_refgem)
    g_run = re.compile('G+')
    # Both strand results are requested up front, whatever direction is asked.
    forward = __ParseSeq(str_chr, int_beg, int_end, '+', reference, g_run)
    reverse = __ParseSeq(str_chr, int_beg, int_end, '-', reference, g_run)

    if str_drct == 'f':
        selected = (forward,)
    elif str_drct == 'r':
        selected = (reverse,)
    elif str_drct == 'b':
        selected = (forward, reverse)
    else:
        selected = ()

    for source in selected:
        for str_sg in source:
            yield str_sg
Example #7
0
def find_repeats(fastafile,outfile):
    """Locate every run of lowercase letters in a FASTA file.

    Lowercase runs are what this function treats as repeats. For each run a
    tab-separated line ``chrm, start, end, name`` is written to ``outfile``,
    where ``start``/``end`` are the ``re`` match span within the sequence and
    ``name`` is ``<chrm>_<start>_<end>``.

    Args:
        fastafile: path handed to ``Fasta``.
        outfile: path of the tab-separated file to create.

    Returns:
        defaultdict mapping each sequence name to its list of
        ``(start, end)`` spans; names without any run are absent.
    """
    f = Fasta(fastafile)
    d = defaultdict(list)
    # Compiled once instead of being looked up for every sequence.
    lowercase_run = re.compile("[a-z]{1,}")
    # The context manager closes the file even if a sequence fails to load.
    with open(outfile, "wb") as out:
        for chrm in f:
            chrm_str = f[chrm][:]
            for match in lowercase_run.finditer(chrm_str):
                rep = match.span()
                d[chrm].append(rep)
                name = "{0}_{1}_{2}".format(chrm, rep[0], rep[1])
                out.write("{0}\t{1}\t{2}\t{3}\n".format(
                    chrm, rep[0], rep[1], name))
    return d
Example #8
0
    def search(self, gene, ref, pos, alt):
        """Genotype one variant from its probe sequences.

        The variant is named ``<gene>_<ref><pos><alt>``. The probe set that
        ``create_variant_probe_set`` returns for that name is written to a
        temporary FASTA file and loaded with ``Fasta``; records whose name
        contains "ref" are collected as reference probes, all others as
        alternate probes.

        Returns:
            dict with the variant name under "query" and the value of
            ``genotype_alleles(refs, alts)`` under "results".
        """
        mut_name = "".join([ref, str(pos), alt])
        gene_mut_name = "_".join([gene, mut_name])

        fasta_string = self.create_variant_probe_set(var_name=gene_mut_name)
        refs = []
        alts = []
        with tempfile.NamedTemporaryFile() as fp:
            fp.write(fasta_string)
            # Push the data to disk: Fasta reopens the file by name.
            fp.flush()
            fasta = Fasta(fp.name)
            # Read the records while the temporary file still exists; it is
            # deleted as soon as the with-block exits.
            for k, v in fasta.items():
                if "ref" in k:
                    refs.append(str(v))
                else:
                    alts.append(str(v))
        return {"query": gene_mut_name, "results": self.genotype_alleles(refs, alts)}
Example #9
0
def main(gff_file, outdir,
         fasta_path='/home/gturco/src/find_cns_gturco/pipeline/data/arabidopsis.fasta'):
    """Extract features of CDS-less parents from a GFF and write their sequences.

    Features are grouped by the value of their ``parent=`` attribute (matched
    case-insensitively, up to the first "." or ";"). Every group that
    contains at least one CDS row is discarded. The remaining rows are
    written to ``<outdir>/at_non_cds.gff``, skipping ChrC/ChrM rows, exon
    rows, and repeated (seqid, start, end) triples; in the written rows the
    seqid is upper-cased with "CHR" removed and the last column is replaced
    by the parent name. That file is then re-read with ``read_gff`` and the
    sequence of each feature is written to ``<outdir>/at_rnas.fasta``.

    Args:
        gff_file: path of the tab-separated GFF file to read.
        outdir: directory receiving the two output files.
        fasta_path: genome FASTA handed to ``Fasta``; defaults to the
            location that used to be hard-coded.
    """
    name = re.compile("parent=([^.;]+)", re.I)

    has_cds = set()
    non_cds_feats = collections.defaultdict(list)
    with open(gff_file) as gff_handle:
        for line in gff_handle:
            line = line.split("\t")
            match = name.search(line[-1])
            if not match:
                continue
            fname = match.group(1)
            non_cds_feats[fname].append(line)
            if line[2].upper() == 'CDS':
                has_cds.add(fname)

    # A single CDS row disqualifies every feature sharing that parent.
    for fname in has_cds:
        del non_cds_feats[fname]

    non_cds_path = outdir + '/at_non_cds.gff'
    seen = set()
    with open(non_cds_path, 'w') as rna_out:
        for k, feat_list in sorted(non_cds_feats.items()):
            for feat in feat_list:
                if feat[0] in ('ChrC', 'ChrM') or feat[2] == 'exon':
                    continue
                # Keyed on the seqid as read, before it is normalised below.
                key = (feat[0], feat[3], feat[4])
                if key in seen:
                    continue
                seen.add(key)
                feat[0] = feat[0].upper().replace('CHR', '')
                feat[-1] = k
                rna_out.write("\t".join(feat) + "\n")

    gff = read_gff(non_cds_path)
    fasta = Fasta(fasta_path)
    with open(outdir + '/at_rnas.fasta', 'w') as fa_out:
        for feature_list in gff.values():
            for feature in feature_list.values():
                seq = fasta.sequence(feature)
                # NOTE(review): the space after ">" is kept from the original
                # output format; confirm downstream readers expect it.
                fa_out.write("> {0}\n".format(feature['name']))
                fa_out.write("{0}\n".format(seq))
Example #10
0
def intron(fa, ann):
    """Write the concatenated gaps between consecutive CDS rows of each gene.

    CDS rows of the tab-separated annotation ``ann`` are grouped by gene. The
    gene name is taken from the last attribute containing "Parent=", cut at
    the first "." and then at the first "_". For every pair of consecutive
    CDS rows (in file order) the sequence from ``previous stop + 1`` to
    ``next start - 1`` is fetched with ``Fasta.sequence`` using the seqid and
    strand of the gene's first CDS row. The pieces are concatenated and
    written to ``intron.fa`` in the current directory as ``><gene>-intron``.

    Args:
        fa: path handed to ``Fasta``.
        ann: path of the annotation file; lines starting with "#" are skipped.
    """
    f = Fasta(fa)
    mdict = {}  # gene -> [(start, stop), ...] of its CDS rows, in file order
    ndict = {}  # gene -> [seqid, strand] taken from its first CDS row
    with open(ann, 'r') as fh:
        for line in fh:
            if line.startswith('#'):
                continue
            new = line.strip().split('\t')
            if new[2] != 'CDS':
                continue
            parents = [attr for attr in new[-1].split(';') if 'Parent=' in attr]
            if not parents:
                # Without a parent the row cannot be assigned to a gene.
                # Previously this reused the index found on an earlier row
                # (or raised NameError on the first row).
                continue
            t = parents[-1].split('.')[0].replace('Parent=', '')
            gene = t.split('_')[0]
            if gene not in mdict:
                mdict[gene] = []
                ndict[gene] = [new[0], new[6]]
            mdict[gene].append((int(new[3]), int(new[4])))

    with open('intron.fa', 'w') as out1:
        for gene in sorted(mdict):
            cds = mdict[gene]
            chrom, strand = ndict[gene]
            pieces = []
            for (_, prev_stop), (next_start, _) in zip(cds, cds[1:]):
                pieces.append(f.sequence({
                    'chr': chrom,
                    'start': prev_stop + 1,
                    'stop': next_start - 1,
                    'strand': strand
                }))
            out1.write('>{0}-intron'.format(gene) + '\n')
            out1.write(''.join(pieces) + '\n')
Example #11
0
def exclude_genes_in_high_repeat_areas(merged_genes, bfasta):
    """Yield only the genes whose sequence is at least 85% unmasked.

    For each gene the slice ``fasta[seqid][start:end]`` is fetched and the
    share of characters other than N/X (case-insensitive) is computed. Genes
    below 85%, and genes whose slice is empty, are dropped. Once the input is
    exhausted the number of dropped genes is logged.

    Args:
        merged_genes: iterable of mappings with 'seqid', 'start' and 'end'.
        bfasta: path handed to ``Fasta``.

    Yields:
        The gene mappings that pass the filter, in input order.
    """
    f = Fasta(bfasta)
    skipped = 0
    for gene in merged_genes:
        # Total length of the gene's sequence.
        seq = str(f[gene['seqid']][gene['start']:gene['end']])
        tot = len(seq)
        # Length of the real (non-N, non-X) sequence.
        good = len(seq.upper().replace('N', '').replace('X', ''))
        # An empty slice has no usable sequence; treating it as masked also
        # avoids dividing by zero.
        if tot == 0 or float(good) / tot < .85:
            skipped += 1
            continue
        yield gene
    log.info("removed %i (otherwise) new genes in masked areas" % skipped)
Example #12
0
def run(args):
    """Emit one FASTA record per BED line, padded by a flank on both sides.

    Reads ``args.bedfile`` (blank lines ignored, columns split on
    whitespace: chrom, start, stop, strand) and fetches each region from
    ``Fasta(args.genome)`` after widening it by ``args.flank``. Records are
    named ``<args.seqname>_<N>`` with N counting from 1, joined with
    newlines, and written to ``args.outfile`` if it is truthy, otherwise
    printed.
    """
    genome = Fasta(args.genome)
    bed_list = [line.strip().split()
                for line in args.bedfile.readlines() if line.strip()]
    result = []
    for number, fields in enumerate(bed_list, 1):
        seq = genome.sequence({
            'chr': fields[0],
            'start': int(fields[1]) - args.flank,
            'stop': int(fields[2]) + args.flank,
            'strand': fields[3]
        })
        # NOTE(review): upper() covers the whole record, header included, as
        # it always has; confirm whether only the sequence was intended.
        result.append('>{0}_{1}\n{2}'.format(args.seqname, number, seq).upper())
    # Both destinations now separate records with a newline; the printed form
    # used to run each sequence straight into the next header.
    text = '\n'.join(result)
    if args.outfile:
        args.outfile.write(text)
    else:
        print(text)
Example #13
0
def main():
    """
    select specific contigs from FASTA file

    Takes one command-line argument, <prefix>. Reads <prefix>_BspQI_key.txt
    (contig number -> FASTA id, 4 header lines) and <prefix>_list.txt (contig
    numbers to keep, one per line), then copies the matching records of
    <prefix>.fasta into <prefix>_new.fasta. Returns 0 after printing the
    usage text when the argument count is wrong; exits with status -1 when a
    listed contig number is not in the key table.
    """
    if len(sys.argv) != 2:
        # The usage text now names the file that is actually written.
        print("Usage: python select.py <prefix>; assume that <prefix>_BspQI_key.txt <prefix>.fasta and <prefix>_list.txt exist; output will be <prefix>_new.fasta")
        return 0
    prefix = sys.argv[1]

    ren = ReadTable(prefix+'_BspQI_key.txt', 4, '\t') # 4 lines of header
    print('renaming table %s' % (ren,))
    select = ReadTable(prefix+'_list.txt', 0) # no header, text file of contigs numbers, one per line
    print('select list %s' % (select,))

    # map contig id x[0] (as an integer) to its FASTA id x[1]
    renaming = {}
    for x in ren:
        renaming[int(x[0])] = x[1]
    print('renaming dictionary %s' % (renaming,))

    # collect the FASTA ids of the contigs to keep
    selected_list = []
    for x in select:
        index = int(x[0]) # integer form so it matches the keys of renaming
        if index not in renaming:
            print('Error: contig %s does not exist' % (index,))
            sys.exit(-1)
        selected_list.append(renaming[index])
    print('selected_list %s' % (selected_list,))
    # set for constant-time membership tests while scanning the FASTA
    selected = set(selected_list)

    # open the fasta file for reading
    fas = Fasta(prefix+'.fasta')
    print('writing new fasta')
    # the context manager closes the output even if a record fails to load
    with open(prefix+'_new.fasta', 'w') as ofa:
        for x in sorted(fas.keys()): # process all the contigs one by one
            if x in selected:
                print('Selecting %s' % (x,))
                ofa.write('>'+x+'\n')
                ofa.write(fas[x][:]+'\n') # entire contig
            else:
                print('Not selecting %s' % (x,))
Example #14
0
def build_gc_array(fastafile="/mnt/ref/hg38.upper.fa", gcdir="gc", n=1000):
    """Write a per-window GC percentage array for every sequence in ``allsomes``.

    For each sequence id found in the FASTA, rolling sums over ``n`` bases
    are taken of the G/C flags and of the non-N flags, sampled every ``n``
    positions, and turned into a rounded percentage stored as uint8 in
    ``<gcdir>/<seqid>.<n>.gc``. Ids missing from the FASTA are logged at
    debug level and skipped.
    """
    from pyfasta import Fasta
    genome = Fasta(fastafile)
    mkdir(gcdir)
    for seqid in allsomes:
        if seqid not in genome:
            logging.debug("Seq {} not found. Continue anyway.".format(seqid))
            continue
        bases = np.array(genome[seqid])
        is_gc = (bases == 'G') | (bases == 'C')  # base is G or C
        is_real = ~(bases == 'N')  # base is not N
        gc_sum = pd.rolling_sum(is_gc, n, min_periods=n / 2)[n - 1::n]
        real_sum = pd.rolling_sum(is_real, n, min_periods=n / 2)[n - 1::n]
        gc_pct = np.asarray(np.rint(gc_sum * 100 / real_sum), dtype=np.uint8)
        arfile = op.join(gcdir, "{}.{}.gc".format(seqid, n))
        gc_pct.tofile(arfile)
        sys.stderr.write("%s %s %s\n" % (seqid, gc_pct, arfile))
    def test_two_annotated_SJs(self):
        """ A transcript with two junctions, each of which matches the
            provided splice junction reference.
        """
        sam_path = "input_files/sams/perfectReferenceMatch_twoIntrons.sam"
        reference = Fasta("input_files/hg38_chr1.fa")

        # Only the splice junction annotation is needed from the result.
        _, _, sj_annot = TC.processSpliceAnnotation(
            "input_files/GM12878_SJs_chr1.tab",
            "scratch/test_jIjM/TC_tmp/",
            set(["chr1"]))

        with open(sam_path, 'r') as handle:
            fields = handle.readline().strip().split('\t')
            transcript = t2.Transcript(fields, reference, sj_annot)

        assert transcript.allJnsAnnotated == True
        assert transcript.isCanonical == True
    def test_plus_strand(self):
        """ A toy plus-strand transcript with sequence AAAGA must be printed
            with the sequence exactly as given in the SAM fields."""

        reference = Fasta("input_files/hg38_chr1.fa")

        # Flag 0 in the second field puts the read on the plus strand.
        fields = [
            "test_read", "0", "chr1", "202892094", "255", "5M", "*",
            "0", "0", "AAAGA", "*",
            "NM:i:0", "MD:Z:5", "jI:B:i,-1", "jM:B:c,-1",
        ]

        # No splice annotation is supplied for this test.
        transcript = ts.Transcript(fields, reference, None)

        assert transcript.printableFa() == ">test_read\nAAAGA"
Example #17
0
    def test_pre_correction_dmel(self):
        """ A noisy Drosophila read: before any correction its CIGAR and SEQ
            strings must agree in length. """

        sam_path = "input_files/drosophila_example/input_read.sam"
        reference = Fasta("input_files/drosophila_example/chr3R.fa")
        no_annotation = set()

        # Header lines start with "@"; the last alignment line read is the
        # one that gets checked.
        with open(sam_path, 'r') as handle:
            for raw_line in handle:
                if not raw_line.startswith("@"):
                    fields = raw_line.strip().split('\t')
                    transcript = t2.Transcript(fields, reference,
                                               no_annotation)

        lengths_agree = t2.check_seq_and_cigar_length(transcript.SEQ,
                                                      transcript.CIGAR)
        assert lengths_agree == True
Example #18
0
def build(args):
    """Print the reference base for every position listed in ``args[1]``.

    Each non-comment line of the tab-separated input supplies a sequence name
    in its second column (translated through ``key_map``) and an integer
    position in its third. The looked-up base is written to stdout after the
    two original column values, tab-separated.
    """
    genome = Fasta(
        '/pipeline/data/b37/VEP/homo_sapiens/75/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa'
    )

    with open(args[1], 'r') as listing:
        for row in listing:
            if not row.startswith('#'):
                columns = row.split('\t')
                seq_name = key_map[columns[1]]
                index = int(columns[2])
                base = genome[seq_name][index]
                out_line = "\t".join([columns[1], columns[2], base])
                sys.stdout.write("{}\n".format(out_line))
Example #19
0
def bed_to_introns(bed_in, fasta_in, fasta_out):
    """Write the sequence of every intron listed in a BED file to a FASTA file.

    Each BED line must hold four whitespace-separated columns: reference,
    start, stop, gene name. The sequence ``fasta[ref][start:stop]`` is stored
    under the id ``<ref>:<start>-<stop>``. Lines are skipped, with a warning,
    when the id was already seen, the reference is missing from the FASTA,
    start is greater than stop, or the slice is empty.

    Args:
        bed_in: path of the BED file to read.
        fasta_in: path of the reference FASTA; record names are cut at the
            first whitespace.
        fasta_out: path of the FASTA file to write.
    """
    logging.info("Opening FASTA: {0}".format(fasta_in))
    logging.info("Note: will take a while the first time it is opened.")
    fasta = Fasta(fasta_in, key_fn=lambda key: key.split()[0])

    seen_keys = set()
    output_seq = []
    with open(bed_in, 'r') as bed_h:
        for line in bed_h:
            ref, start, stop, genename = line.split()
            key = ref + ':' + start + '-' + stop
            if key in seen_keys:
                logging.warning("ERROR: {0} appears once already".format(key))
                continue
            seen_keys.add(key)
            # Checked before anything indexes fasta[ref]; the start > stop
            # branch below used to raise KeyError for unknown references.
            if ref not in fasta:
                logging.warning(
                    "Skipping {0} due to missing sequence in the FASTA".format(
                        key))
                continue
            if int(start) > int(stop):
                logging.warning(
                    "Intron coords greater than reference: {0}:{1}-{2}".format(
                        ref, start, stop))
                logging.warning("Reference length: {0}".format(len(fasta[ref])))
                continue
            seq = fasta[ref][int(start):int(stop)]
            if len(seq) == 0:
                logging.warning("Intron length is 0? {0}:{1}-{2}".format(
                    ref, start, stop))
                continue
            output_seq.append(SeqRecord(Seq(seq), key, '', ''))
            # Report progress once per 10000 stored introns, not once per
            # input line while the count happens to sit on a multiple.
            if len(output_seq) % 10000 == 0:
                logging.info("On intron: {0}".format(len(output_seq)))

    with open(fasta_out, 'w') as outf:
        logging.info("Writing intron sequences out to {0}".format(fasta_out))
        SeqIO.write(output_seq, outf, 'fasta')
    def test_minus_strand(self):
        """ A toy minus-strand transcript with SAM sequence AAAGA must be
            printed as the reverse complement of that sequence.
        """

        reference = Fasta("input_files/hg38_chr1.fa")

        # Flag 16 in the second field puts the read on the minus strand.
        fields = [
            "test_read", "16", "chr1", "202892094", "255", "5M", "*",
            "0", "0", "AAAGA", "*",
            "NM:i:0", "MD:Z:5", "jI:B:i,-1", "jM:B:c,-1",
        ]

        # No splice annotation is supplied for this test.
        transcript = ts.Transcript(fields, reference, None)

        assert transcript.printableFa() == ">test_read\nTCTTT"
Example #21
0
def get_sketch(fasta, n_kmers=100, k=15):
    """Return the first ``n_kmers`` distinct k-mers of a FASTA file, sorted.

    Every window of length ``k`` in every record is collected; the distinct
    k-mers are sorted and the lowest ``n_kmers`` are returned.

    Args:
        fasta: path handed to ``Fasta``.
        n_kmers: maximum number of k-mers to return.
        k: k-mer length.

    Returns:
        Sorted list of at most ``n_kmers`` k-mers.
    """
    f = Fasta(fasta)
    kmers = set()
    for chrom in f.keys():
        seq = f[chrom]
        # +1 so the window ending on the last base is included; records
        # shorter than k contribute nothing.
        for i in range(len(seq) - k + 1):
            kmers.add(seq[i:i + k])

    # Exactly n_kmers entries at most; the old `<=` test let one extra in.
    return sorted(kmers)[:max(n_kmers, 0)]
Example #22
0
def generate_corpusfile(fasta_fname, n, corpus_fname):
    '''
    Args:
        fasta_fname: path of the input FASTA file
        n: the number of chunks to split. In other words, "n" for "n-gram"
        corpus_fname: path of the corpus file to write
    Description:
        Protvec uses word2vec inside, and it requires to load corpus file
        to generate corpus. Every record of the FASTA file is split with
        split_ngrams and each resulting pattern is written as one line of
        space-separated n-grams.
    '''
    # The context manager closes the corpus file even if a record fails.
    with open(corpus_fname, "w") as f:
        fasta = Fasta(fasta_fname)
        for record_id in tqdm(fasta.keys(), desc='corpus generation progress'):
            seq = str(fasta[record_id])
            for ngram_pattern in split_ngrams(seq, n):
                f.write(" ".join(ngram_pattern) + "\n")
def extract_reference_allele():
    """Write the reference base at every position in ``pos`` to the file REF.

    Uses the module-level ``args.reference`` (FASTA path) and ``pos``
    (iterable of positions convertible to int). Bases are taken from the
    first record of the FASTA file with start == stop == position. The
    output starts with a "Ref" header line followed by one base per line.

    Raises:
        ValueError: if the FASTA file contains no records.
    """
    print("Extracting Reference Allele from Reference Fasta file - %s to REF\n" % args.reference)
    # Get reference genome ID from reference fasta file
    get_reference = Fasta(args.reference)
    record_ids = list(get_reference.keys())
    if not record_ids:
        raise ValueError(
            "No sequences found in reference fasta file - %s" % args.reference)
    # The first record is always the one queried, so bind and report that id.
    # It used to be left unbound unless the file held exactly one record, and
    # was printed as a list.
    ref_id = str(record_ids[0])
    print("The reference genome ID from reference genome - %s" % ref_id)

    with open("REF", 'w') as fileObj:
        fileObj.write('Ref' + '\n')
        for item in pos:
            ref_allele = str(
                get_reference.sequence({
                    'chr': ref_id,
                    'start': int(item),
                    'stop': int(item)
                }))
            fileObj.write(ref_allele + '\n')
Example #24
0
    def test_not_matching(self):
        """ The same kind of Drosophila read, but after correction with
            TCv2.0.1. samtools flagged the corrected read as having
            inconsistent CIGAR and SEQ fields, so the check must fail. """

        sam_path = "input_files/drosophila_example/bad_correction.sam"
        reference = Fasta("input_files/drosophila_example/chr3R.fa")
        no_annotation = set()

        # Header lines start with "@"; the last alignment line read is the
        # one that gets checked.
        with open(sam_path, 'r') as handle:
            for raw_line in handle:
                if not raw_line.startswith("@"):
                    fields = raw_line.strip().split('\t')
                    transcript = t2.Transcript(fields, reference,
                                               no_annotation)

        lengths_agree = t2.check_seq_and_cigar_length(transcript.SEQ,
                                                      transcript.CIGAR)
        assert lengths_agree == False
Example #25
0
    def __init__(self, cellline_trn_num, path='/home/liuqiao/software/DeepCAGE/data/encode'):
        """Load the data tables under ``path`` and split cell lines into train/test.

        Loads the genome FASTA, the openness table and the TF expression
        table (both stored as log(x + 1)), and the TF motif score table.
        ``cellline_trn_num`` positions are drawn without replacement (NumPy
        seed fixed at 123) from the openness columns; the same positions
        select the training rows of the TF expression index. Everything not
        selected becomes the test set.
        """
        self.path = path
        self.genome = Fasta('%s/genome.fa'%path)

        raw_openness = pd.read_csv('%s/readscount_normalized_filtered.csv'%path,
                                   header=0, index_col=[0], sep='\t')
        self.pd_openness = np.log(raw_openness+1)

        raw_tf_gexp = pd.read_csv('%s/preprocessed/tf_gexp.csv'%path,
                                  sep='\t', header=0, index_col=[0])
        self.pd_tf_gexp = np.log(raw_tf_gexp+1)

        self.pd_tf_bs = pd.read_csv('%s/preprocessed/tf_motif_score.csv'%path,
                                    sep='\t', header=0, index_col=[0])

        # Fixed seed so the same split is drawn on every run.
        np.random.seed(123)
        n_columns = self.pd_openness.shape[1]
        train_idx = np.random.choice(np.arange(n_columns),
                                     size=cellline_trn_num, replace=False)

        self.train_dseq_celllines = np.array(list(self.pd_openness.columns))[train_idx]
        held_out_dseq = []
        for item in self.pd_openness.columns:
            if item not in self.train_dseq_celllines:
                held_out_dseq.append(item)
        self.test_dseq_celllines = held_out_dseq

        self.train_rseq_celllines = np.array(list(self.pd_tf_gexp.index))[train_idx]
        held_out_rseq = []
        for item in self.pd_tf_gexp.index:
            if item not in self.train_rseq_celllines:
                held_out_rseq.append(item)
        self.test_rseq_celllines = held_out_rseq

        # Train and test together must account for every cell line.
        assert self.pd_openness.shape[1] == len(self.train_dseq_celllines)+len(self.test_dseq_celllines)
        assert self.pd_tf_gexp.shape[0] == len(self.train_rseq_celllines)+len(self.test_rseq_celllines)
Example #26
0
    def __init__(self):
        """Set data locations and fixed sizes, load the hg19 FASTA, and make
        sure the feature output directory exists."""
        data_root = '../data/'

        # Locations
        self.datapath = data_root
        self.dna_peak_c_path = 'dnase_peaks_conservative/'
        self.label_path = os.path.join(data_root, 'chipseq_labels/')
        self.benchmark_path = os.path.join(data_root, 'benchmark_labels/')
        self.hg19 = Fasta(os.path.join(data_root, 'annotations/hg19.genome.fa'))

        # Fixed sizes
        self.bin_size = 200
        self.correction = 400
        self.train_length = 51676736
        self.ladder_length = 8843011
        self.test_length = 60519747
        self.chunk_size = 1000000
        self.num_channels = 4

        # Counts taken from the instance's own listings
        self.num_trans_fs = len(self.get_trans_fs())
        self.num_celltypes = len(self.get_celltypes())

        feature_dir = os.path.join(data_root, 'preprocess/features')
        self.save_dir = feature_dir
        if not os.path.exists(feature_dir):
            os.mkdir(feature_dir)
Example #27
0
def write_files(original_fasta, out_dir, counts, write_bin):
    """Write the methylation text, summary and (optionally) binary files.

    For every sequence id in ``counts`` (sorted) the methylation types of the
    FASTA sequence are computed with ``calc_methylation``, a summary line is
    added to ``<out_dir>/summary.txt``, and the per-base data is appended to
    a dated ``methyl-data-<today>.txt`` file placed next to the binary files.
    A genome-wide summary is written last.

    Args:
        original_fasta: path handed to ``Fasta`` and ``bin_paths_from_fasta``.
        out_dir: directory for the summary and binary outputs.
        counts: mapping of seqid to a mapping with 'c' and 't' entries;
            presumably arrays, since ``tofile`` is called on them -- confirm
            against the caller.
        write_bin: when true, also dump the c, t and methylation-type arrays
            to their per-sequence binary files.
    """
    fa = Fasta(original_fasta)
    fc, ft, fmethyltype = bin_paths_from_fasta(original_fasta, out_dir)
    text_path = op.dirname(fmethyltype) + "/methyl-data-%s.txt" \
        % datetime.date.today()
    f_pat = bin_paths_from_fasta(original_fasta, out_dir, pattern_only=True)

    summary_counts = {ctx: {'cs': 0, 'ts': 0} for ctx in ('CHG', 'CHH', 'CG')}

    # Both handles used to be left open; the with-block flushes and closes
    # them even when a sequence fails part-way through.
    with open(text_path, 'w') as out, \
            open(op.join(out_dir, "summary.txt"), "w") as f_summary:
        out.write("%s\n" % (make_header(),))
        out.write("#seqid\tmt\tbp\tc\tt\n")
        sys.stderr.write("#> writing: %s %s\n" % (f_pat, f_summary.name))

        for i, seqid in enumerate(sorted(counts.keys())):
            seq = str(fa[seqid])
            mtype = calc_methylation(seq)
            cs = counts[seqid]['c']
            ts = counts[seqid]['t']

            print_summary(seqid,
                          cs,
                          ts,
                          mtype,
                          summary_counts,
                          f_summary,
                          print_header=(i == 0))

            if write_bin:
                cs.tofile(fc % seqid)
                ts.tofile(ft % seqid)
                mtype.tofile(fmethyltype % seqid)

            to_text_file(cs, ts, mtype, seqid, out)
            # Drop the local references before the next sequence is loaded.
            del cs
            del ts
            del mtype

        print_genome_summary(summary_counts, f_summary)
    def test_crash_correction(self):
        """ A case that is supposed to make NCSJ correction fail, leaving the
           transcript uncorrected. The mapping has produced a 7-bp
           micro-exon with a canonical but likely incorrect junction on its
           left and a non-canonical junction on its right. Correcting it
           would leave two adjacent introns around a zero-length exon, which
           is not valid."""

        expected_CIGAR = "".join([
            "1211M5612N57M464N30M2717N120M1097N23M2632N146M1225N",
            "140M4770N72M5051N132M1513N87M567N142M3780N100M2160N",
            "59M864N31M9891N69M1711N7M1341N47M13S",
        ])

        # Build the reference bundle
        scratch_dir = "scratch/test/TC_tmp/"
        os.system("mkdir -p %s" % scratch_dir)
        refs = dstruct.Struct()
        donors, acceptors, annotation = TC.processSpliceAnnotation(
            "input_files/chr11_sjs.txt", scratch_dir, set(["chr11"]))
        refs.donors = donors
        refs.acceptors = acceptors
        refs.sjAnnot = annotation
        refs.genome = Fasta("input_files/hg38_chr11.fa")

        with open("input_files/sams/microexon.sam", 'r') as handle:
            fields = handle.readline().strip().split('\t')

        # Build the transcript and its log record
        transcript = t2.Transcript(fields, refs.genome, refs.sjAnnot)
        logInfo = TC.init_log_info(fields)

        assert transcript.isCanonical == False

        # Try to correct the splice junction, allowing a distance of 5
        transcript, TE_entries = TC.cleanNoncanonical(transcript, refs,
                                                      5, logInfo)

        assert transcript.isCanonical == False
        assert transcript.MD == "MD:Z:2473"
        assert logInfo.corrected_NC_SJs == 0
        assert logInfo.uncorrected_NC_SJs == 1
        assert transcript.CIGAR == expected_CIGAR
def parse_map(name, reads):
    """Assemble a group of reads with fermikit and write the result as a BAM.

    The chr1 window from ``reads[0].pos - padding`` to
    ``reads[-1].pos + padding`` (``padding`` is a module-level value) is
    written to a temporary reference FASTA, and the reads to a temporary
    FASTQ. fermi2, make, bwa index and run-calling are then run through the
    shell, and the reads of the resulting ``<tmp_fq>.srt.bam`` are copied to
    ``<name>.bam`` with their positions shifted by the window start.

    Args:
        name: base name of the output BAM file.
        reads: ordered sequence of reads exposing pos, qname, seq and qual.

    The temporary files are left on disk, as before.
    """
    f = Fasta("/hive/groups/recon/projs/gorilla_eichler/pipeline_data/assemblies/susie_3_2/human.fa")
    start, stop = reads[0].pos - padding, reads[-1].pos + padding
    seq = f['chr1'][start:stop]
    # Write through the descriptor mkstemp returns so it gets closed; taking
    # only the path used to leak one open descriptor per temporary file.
    ref_fd, tmp_ref = tempfile.mkstemp()
    with os.fdopen(ref_fd, "w") as outf:
        outf.write(">chr1_{}-{}\n{}\n".format(start, stop, seq))
    fq_fd, tmp_fq = tempfile.mkstemp()
    with os.fdopen(fq_fd, "w") as outf:
        for i, read in enumerate(reads):
            outf.write("@{}\n{}\n+\n{}\n".format(read.qname + str(i), read.seq, read.qual))
    # Floor division keeps the size argument a whole number of kilobases on
    # both Python 2 and Python 3. The commands need a shell for the
    # redirection and the pipe; every interpolated value is generated here.
    subprocess.call("/cluster/home/ifiddes/fermikit/fermi.kit/fermi2.pl unitig -s {}k -l100 -p {} {} > {}".format((stop - start) // 1000, tmp_fq, tmp_fq, tmp_fq + ".mak"), shell=True)
    subprocess.call("make -f {}".format(tmp_fq + ".mak"), shell=True)
    subprocess.call("bwa index {}".format(tmp_ref), shell=True)
    subprocess.call("/cluster/home/ifiddes/fermikit/fermi.kit/run-calling {} {} | sh".format(tmp_ref, tmp_fq + ".mag.gz"), shell=True)
    header = {"HD": {"VN": "1.3"}, "SQ": [{"LN": 248956422, "SN": "chr1"}]}
    out_path = os.path.join("/hive/users/ifiddes/longranger-1.2.0/separated_phased_bams/fermi-assembly_NA12878", name + ".bam")
    # Both BAM handles are now closed on exit, including the input one.
    with pysam.Samfile(out_path, "wb", header=header) as outf, \
            pysam.Samfile(tmp_fq + ".srt.bam") as inf:
        for read in inf:
            # Move the read from window coordinates back to chr1 coordinates.
            read.pos = read.pos + start
            outf.write(read)
def updownTSS_seq(type):
    """Print the sequence around the start of every feature of one type.

    For plus-strand features the window runs from ``upRange`` bases before
    the feature start to ``downRange`` bases after it. For minus-strand
    features the window is placed around the feature end instead and the
    sequence is reverse-complemented. Windows are clipped to the sequence
    bounds. Each record is printed as a ``>id_[start-end]`` header followed
    by the sequence in lines of 60 bases.

    Relies on the module-level ``db``, ``myFasta``, ``upRange`` and
    ``downRange``.

    Args:
        type: feature type passed to ``db.features_of_type``.
    """
    # Load the genome once instead of once per feature.
    genomefa = Fasta(myFasta)
    for p in db.features_of_type(type):
        chrlen = len(genomefa[p.chrom])
        if p.strand == '-':
            seqstart = p.end - 1 - downRange
            seqend = p.end + upRange
        else:
            seqstart = p.start - 1 - upRange
            seqend = p.start + downRange
        # Clip on both sides for both strands. A negative start would wrap
        # around as a slice index, and an end past the sequence would make
        # the printed coordinates wrong.
        seqstart = max(seqstart, 0)
        seqend = min(seqend, chrlen)
        # get sequence based on coordinates (start is 0-based)
        p_updown = genomefa[p.seqid][seqstart:seqend]
        if p.strand == '-':
            p_updown = Seq(p_updown).reverse_complement()
        print('>' + p.id + "_[" + str(seqstart) + "-" + str(seqend) +
              "]")  # print out the coordinates
        for i in range(0, len(p_updown), 60):  # print 60 bases per line
            print(p_updown[i:i + 60])