Exemple #1
0
def RunTalesfTask(options):
    """Run single-TAL site scoring against the sequence source named in ``options``.

    The sequence file is chosen in this order of precedence: the cached NCBI
    download (when ``options.ncbi`` is not "NA"), the organism's genome file,
    the organism's promoterome file, and finally ``options.fasta``.

    Raises:
        TaskError: when ``ScoreTalesfTask`` returns 1.
    """
    logger = create_logger(options.logFilepath)
    logger("Beginning")

    # forwardOnly is simply the negation of the revcomp option.
    forwardOnly = not options.revcomp

    use_ncbi = options.ncbi != "NA"

    if use_ncbi:
        logger("Retrieving NCBI sequence. This could take a while if this sequence hasn't been used recently and needs to be downloaded from NCBI.")

    # The cached Entrez file is only entered as a context when an NCBI ID was given.
    with Conditional(use_ncbi, CachedEntrezFile(logger, options.ncbi)) as maybe_entrez_file:

        if use_ncbi:
            # Validate the downloaded sequence before handing its path to the scorer.
            check_fasta_pasta(maybe_entrez_file.file)
            seqFilename = maybe_entrez_file.filepath
        elif options.genome:
            seqFilename = GENOME_FILE % options.organism
        elif options.promoterome:
            seqFilename = PROMOTEROME_FILE % options.organism
        else:
            seqFilename = options.fasta

        # The organism name is only passed along for whole-genome searches.
        organism_name = options.organism if options.genome else ""

        result = ScoreTalesfTask(
            seqFilename,
            options.rvdString,
            options.outputFilepath,
            options.logFilepath,
            forwardOnly,
            options.cupstream,
            options.cutoff,
            4,
            organism_name,
        )

        if result == 1:
            raise TaskError()
def RunPairedTalesfTask(options):
    """Run paired-TAL site scoring against the sequence source named in ``options``.

    The sequence file is the organism's genome file when ``options.genome``
    is set, else the organism's promoterome file when ``options.promoterome``
    is set, else ``options.fasta``.

    Raises:
        TaskError: when ``ScorePairedTalesfTask`` returns 1.
    """
    logger = create_logger(options.logFilepath)
    logger("Beginning")

    if options.genome:
        seqFilename = GENOME_FILE % options.organism
    elif options.promoterome:
        seqFilename = PROMOTEROME_FILE % options.organism
    else:
        seqFilename = options.fasta

    # The organism name is only passed along for whole-genome searches.
    organism_name = options.organism if options.genome else ""

    result = ScorePairedTalesfTask(
        seqFilename,
        options.rvdString,
        options.rvdString2,
        options.outputFilepath,
        options.logFilepath,
        options.cupstream,
        options.cutoff,
        options.min,
        options.max,
        4,
        organism_name,
    )

    if result == 1:
        raise TaskError()
Exemple #3
0
def RunTalesfTask(options):
    """Run single-TAL site scoring against a genome, promoterome or FASTA file.

    The sequence file is the organism's genome file when ``options.genome``
    is set, else the organism's promoterome file when ``options.promoterome``
    is set, else ``options.fasta``.

    Raises:
        TaskError: when ``ScoreTalesfTask`` returns 1.
    """
    logger = create_logger(options.logFilepath)
    logger("Beginning")

    # forwardOnly is simply the negation of the revcomp option.
    forwardOnly = not options.revcomp

    if options.genome:
        seqFilename = GENOME_FILE % options.organism
    elif options.promoterome:
        seqFilename = PROMOTEROME_FILE % options.organism
    else:
        seqFilename = options.fasta

    # The organism name is only passed along for whole-genome searches.
    organism_name = options.organism if options.genome else ""

    result = ScoreTalesfTask(
        seqFilename,
        options.rvdString,
        options.outputFilepath,
        options.logFilepath,
        forwardOnly,
        options.cupstream,
        options.cutoff,
        4,
        organism_name,
    )

    if result == 1:
        raise TaskError()
Exemple #4
0
    # NOTE(review): this snippet starts mid-function; the parser construction and
    # the earlier option definitions are not visible here.
    # A node ID of -1 (the default) means "no node ID given"; see the check below.
    parser.add_option('-z', '--nodeid', dest='nodeID', type='int', default = '-1', help='Drupal node ID')
    parser.add_option('-k', '--ipaddr', dest='ip_address', type='string', default = '', help='IP address of job submitter')
    (options, args) = parser.parse_args()
    
    # Normalise the RVD string before validation: trim whitespace, upper-case.
    options.rvdString = options.rvdString.strip().upper()
    
    validate_options_handler(validateOptions, options)
    
    # Pick the queue name from the kind of sequence source requested.
    if options.genome:
        queue_name = 'talesf_genome'
    elif options.promoterome:
        queue_name = 'talesf_promoterome'
    else:
        queue_name = 'talesf_other'
    
    if options.nodeID != -1:
        
        # A node ID was supplied, so the job is queued rather than run inline;
        # that path needs Celery.
        if not celery_found:
            raise TaskError("nodeID option was provided but Celery is not installed")
        
        logger = create_logger(options.logFilepath)
        logger("Your task has been queued and will be processed when a worker node becomes available")
        
        # Imported here rather than at module top (presumably because it needs
        # Celery -- confirm against findRvdTAL).
        from findRvdTAL import TalesfTask
        # When run from Drupal the job is queued as an asynchronous task, with
        # every parsed option forwarded as a keyword argument.
        TalesfTask.apply_async(kwargs=vars(options), queue=queue_name)
        
    else:
        
        # No node ID: run the task synchronously in this process.
        RunTalesfTask(options)
Exemple #5
0
def _findtal_translate_site(site_seq, strong_binding_RVDs):
    """Translate ``site_seq`` base by base using ``strong_binding_RVDs``.

    Returns ``(rvd_string, cg_count)`` where ``rvd_string`` is the
    space-joined RVDs and ``cg_count`` is the number of C/G bases, or
    ``None`` when the sequence contains a base that has no RVD mapping.
    """
    rvds = []
    cg_count = 0

    for base in site_seq:
        if base not in strong_binding_RVDs:
            return None
        if base == "C" or base == "G":
            cg_count += 1
        rvds.append(strong_binding_RVDs[base])

    return " ".join(rvds), cg_count


def _findtal_scan_record(
    gene, options, logger, strong_binding_RVDs, u_bases, strand_min, strand_max, spacer_min, spacer_max
):
    """Return the binding sites found in one FASTA record, in discovery order.

    With ``options.filter == 1`` only the cut site ``options.filterbase`` is
    examined and every site found is returned. With ``options.filter == 0``
    the candidates are reduced with ``filterByTALSize`` to at most one site
    per cut-site position. For any other value every position is scanned
    and every site found is returned.
    """
    sequence = str(gene.seq).upper()

    # Half-site pairs already reported for this record, stored as
    # {TAL1 sequence: {TAL2 sequence: [BindingSite, ...]}} and used to skip repeats.
    site_entry_counts = {}

    if options.filter == 1:
        if options.filterbase > len(sequence):
            logger("Skipped %s as the provided cut site was greater than the sequence length" % (gene.id))
            return []
        cut_site_positions = [options.filterbase]
    else:
        cut_site_positions = range(len(sequence))

    logger("Scanning %s for binding sites" % (gene.id))

    record_sites = []

    for i in cut_site_positions:

        cut_site_potential_sites = []

        for spacer_size in range(spacer_min, spacer_max + 1):

            spacer_potential_sites = []

            # The cut site splits the spacer; an odd spacer puts the extra base on the right.
            spacer_size_left = spacer_size // 2
            spacer_size_right = spacer_size - spacer_size_left

            # Skip cut sites too close to either end to fit a minimum-length
            # half-site plus its flanking base.
            if i < (strand_min + spacer_size_left + 1) or i > (
                len(sequence) - (strand_min + spacer_size_right) - 1
            ):
                continue

            for u_base in u_bases:

                # The base following the second half-site is the complement of
                # the upstream base (u_bases only ever holds "T" and/or "C").
                d_base = "A" if u_base == "T" else "G"

                u_pos_search_start = max(i - (strand_max + spacer_size_left) - 1, 0)
                u_pos_search_end = i - (strand_min + spacer_size_left)

                d_pos_search_start = i + (strand_min + spacer_size_right)
                d_pos_search_end = i + (strand_max + spacer_size_right) + 1

                # Every upstream-base position in the search window, right to left.
                u_positions = []
                while True:
                    u_pos = sequence.rfind(u_base, u_pos_search_start, u_pos_search_end)
                    if u_pos == -1:
                        break
                    u_pos_search_end = u_pos
                    u_positions.append(u_pos)

                # Every downstream-base position in the search window, left to right.
                d_positions = []
                while True:
                    d_pos = sequence.find(d_base, d_pos_search_start, d_pos_search_end)
                    if d_pos == -1:
                        break
                    d_pos_search_start = d_pos + 1
                    d_positions.append(d_pos)

                found_first = False

                # Reversing both lists tries the longest half-sites first.
                for u_pos in reversed(u_positions):

                    for d_pos in reversed(d_positions):

                        # uses inclusive start, exclusive end
                        tal1_start = u_pos + 1
                        tal1_end = i - spacer_size_left
                        tal1_seq = sequence[tal1_start:tal1_end]
                        tal2_start = i + spacer_size_right
                        tal2_end = d_pos
                        tal2_seq = sequence[tal2_start:tal2_end]

                        # Skip the pair if either half-site was already recorded as a
                        # partner of either half-site (in any combination).
                        seen_with_tal1 = site_entry_counts.get(tal1_seq, ())
                        seen_with_tal2 = site_entry_counts.get(tal2_seq, ())
                        if (
                            tal2_seq in seen_with_tal1
                            or tal1_seq in seen_with_tal1
                            or tal1_seq in seen_with_tal2
                            or tal2_seq in seen_with_tal2
                        ):
                            continue

                        tal1_translation = _findtal_translate_site(tal1_seq, strong_binding_RVDs)
                        if tal1_translation is None:
                            continue

                        # The second TAL's RVDs come from the reverse complement of its plus-strand site.
                        tal2_translation = _findtal_translate_site(
                            reverseComplement(tal2_seq), strong_binding_RVDs
                        )
                        if tal2_translation is None:
                            continue

                        tal1_rvd, tal1_cg_count = tal1_translation
                        tal2_rvd, tal2_cg_count = tal2_translation
                        cg_count = tal1_cg_count + tal2_cg_count

                        binding_site = BindingSite(
                            seq_id=gene.id,
                            cutsite=i,
                            seq1_start=tal1_start,
                            seq1_end=tal1_end,
                            seq1_seq=tal1_seq,
                            seq1_rvd=tal1_rvd,
                            spacer_start=tal1_end,
                            spacer_end=tal2_start,
                            spacer_seq=sequence[tal1_end:tal2_start],
                            seq2_start=tal2_start,
                            seq2_end=tal2_end,
                            seq2_seq=tal2_seq,
                            seq2_rvd=tal2_rvd,
                            upstream=u_base,
                            cg_percent=int(
                                round(float(cg_count) / (len(tal1_seq) + len(tal2_seq)), 2) * 100
                            ),
                        )

                        findRESitesInSpacer(sequence, binding_site)

                        site_entry_counts.setdefault(binding_site.seq1_seq, {}).setdefault(
                            binding_site.seq2_seq, []
                        ).append(binding_site)
                        spacer_potential_sites.append(binding_site)

                        # With filter 0 only the first valid pair per upstream base is kept.
                        if options.filter == 0:
                            found_first = True
                            break

                    if found_first:
                        break

            if spacer_potential_sites:
                if options.filter == 0:
                    cut_site_potential_sites.append(reduce(filterByTALSize, spacer_potential_sites))
                else:
                    cut_site_potential_sites.extend(spacer_potential_sites)

        if cut_site_potential_sites:
            if options.filter == 0:
                record_sites.append(reduce(filterByTALSize, cut_site_potential_sites))
            else:
                record_sites.extend(cut_site_potential_sites)

    return record_sites


def RunFindTALTask(options):
    """Find paired TAL binding sites in ``options.fasta`` and write them as a table.

    Every record of the FASTA file is scanned for pairs of half-sites
    separated by a spacer. Results go to ``options.outpath``, or to
    ``options.outdir + options.job + options.outfile`` when ``outpath`` is
    "NA", as two metadata lines, a header row and one tab-separated row per
    site. When ``options.check_offtargets`` is set, an off-target count column
    is added, computed by ``PairedTargetFinderCountTask`` against the
    off-target FASTA, the NCBI download, the genome/promoterome file or the
    input FASTA, in that order of precedence.

    Both the input and the output file are closed even if the scan raises.

    Raises:
        TaskError: when off-target counting is requested but ``tfcount_found``
            is false, or when a downloaded NCBI record exceeds
            ``OFFTARGET_COUNTING_SIZE_LIMIT``.
    """
    logger = create_logger(options.logFilepath)

    logger("Beginning")

    use_ncbi_offtargets = options.check_offtargets and options.offtargets_ncbi != "NA"

    if use_ncbi_offtargets:
        logger(
            "Retrieving NCBI off-target sequence. This could take a while if this sequence hasn't been used recently and needs to be downloaded from NCBI."
        )

    # The cached Entrez file is only entered as a context when NCBI off-targets were requested.
    with Conditional(
        use_ncbi_offtargets, CachedEntrezFile(logger, options.offtargets_ncbi)
    ) as maybe_entrez_file:

        if options.check_offtargets:

            if not tfcount_found:
                raise TaskError("Non off-target counting worker attempted to process off-target counting task.")

            if options.offtargets_ncbi != "NA":

                logger("Finished retrieving NCBI off-target sequence.")

                # Validate downloaded sequence
                check_fasta_pasta(maybe_entrez_file.file)

                for record in FastaIterator(maybe_entrez_file.file, alphabet=generic_dna):
                    if len(record.seq) > OFFTARGET_COUNTING_SIZE_LIMIT:
                        raise TaskError(
                            "Off-Target counting is only supported for NCBI records where all individual sequences are under %d megabases in size"
                            % (OFFTARGET_COUNTING_SIZE_LIMIT / 1000000)
                        )

            offtarget_seq_filename = ""

            if options.offtargets_fasta != "NA":
                offtarget_seq_filename = options.offtargets_fasta
            elif options.offtargets_ncbi != "NA":
                offtarget_seq_filename = maybe_entrez_file.filepath
            elif options.genome:
                offtarget_seq_filename = GENOME_FILE % options.organism
            elif options.promoterome:
                offtarget_seq_filename = PROMOTEROME_FILE % options.organism
            else:
                offtarget_seq_filename = options.fasta

        strong_binding_RVDs = {"A": "NI", "C": "HD", "G": "NN", "T": "NG"}

        if options.gspec:
            strong_binding_RVDs["G"] = "NH"

        if options.outpath == "NA":
            output_filepath = options.outdir + options.job + options.outfile
        else:
            output_filepath = options.outpath

        table_ignores = ["TAL1 length", "TAL2 length", "Spacer length"]

        strand_min = 15 if options.arraymin is None else options.arraymin
        strand_max = 20 if options.arraymax is None else options.arraymax

        spacer_min = 15 if options.min is None else options.min
        spacer_max = 30 if options.max is None else options.max

        # cupstream 0 allows only T upstream, 1 only C, any other value allows both.
        u_bases = []

        if options.cupstream != 1:
            u_bases.append("T")

        if options.cupstream != 0:
            u_bases.append("C")

        offtarget_header = "\tOff-Target Counts" if options.check_offtargets else ""

        # Input is opened before output, so a missing FASTA never creates an empty result file.
        with open(options.fasta, "r") as seq_file, open(output_filepath, "w") as out:

            out.write("table_ignores:" + ",".join(table_ignores) + "\n")

            out.write(
                "options_used:"
                + ", ".join(
                    [
                        "array_min = " + str(strand_min),
                        "array_max = " + str(strand_max),
                        "spacer_min = " + str(spacer_min),
                        "spacer_max = " + str(spacer_max),
                        "upstream_base = " + (" or ".join(u_bases)),
                    ]
                )
                + "\n"
            )

            out.write(
                "Sequence Name\tCut Site\tTAL1 start\tTAL2 start\tTAL1 length\tTAL2 length\tSpacer length\tSpacer range\tTAL1 RVDs\tTAL2 RVDs\tPlus strand sequence\tUnique RE sites in spacer\t% RVDs HD or NN/NH"
                + offtarget_header
                + "\n"
            )

            binding_sites = []

            for gene in FastaIterator(seq_file, alphabet=generic_dna):
                binding_sites.extend(
                    _findtal_scan_record(
                        gene,
                        options,
                        logger,
                        strong_binding_RVDs,
                        u_bases,
                        strand_min,
                        strand_max,
                        spacer_min,
                        spacer_max,
                    )
                )

            if options.streubel:
                binding_sites[:] = list(ifilterfalse(filterStreubel, binding_sites))

            if options.check_offtargets and binding_sites:

                off_target_pairs = [
                    [binding_site.seq1_rvd, binding_site.seq2_rvd] for binding_site in binding_sites
                ]

                off_target_counts = PairedTargetFinderCountTask(
                    offtarget_seq_filename,
                    options.logFilepath,
                    options.cupstream,
                    3.0,
                    spacer_min,
                    spacer_max,
                    off_target_pairs,
                )

                # Indexed (not zipped) so a short result list fails loudly.
                for i, binding_site in enumerate(binding_sites):
                    binding_site.offtarget_counts = off_target_counts[i]

            for binding_site in binding_sites:

                output_items = [
                    str(binding_site.seq_id),
                    str(binding_site.cutsite),
                    str(binding_site.seq1_start),
                    # TAL2 start is reported as the last base of its plus-strand site.
                    str(binding_site.seq2_end - 1),
                    str(binding_site.seq1_end - binding_site.seq1_start),
                    str(binding_site.seq2_end - binding_site.seq2_start),
                    str(binding_site.spacer_end - binding_site.spacer_start),
                    str(binding_site.spacer_start) + "-" + str(binding_site.spacer_end - 1),
                    binding_site.seq1_rvd,
                    binding_site.seq2_rvd,
                    binding_site.upstream
                    + " "
                    + binding_site.seq1_seq
                    + " "
                    + binding_site.spacer_seq.lower()
                    + " "
                    + binding_site.seq2_seq
                    + " "
                    + ("A" if binding_site.upstream == "T" else "G"),
                    binding_site.re_sites,
                    str(binding_site.cg_percent),
                ]

                if options.check_offtargets:
                    output_items.append(" ".join(str(binding_site.offtarget_counts[x]) for x in range(5)))

                out.write("\t".join(output_items) + "\n")

        logger("Finished")
Exemple #6
0
def _findtal_old_composition_ok(half_site):
    """Return True when every base fraction of ``half_site`` is within the percent_comp_range_* limits."""
    site_len = float(len(half_site))
    for base in 'ACGT':
        fraction = half_site.count(base) / site_len
        if fraction > percent_comp_range_top[base] or fraction < percent_comp_range_bottom[base]:
            return False
    return True


def RunFindTALOldTask(options):
    """Find paired TAL binding sites with the older exhaustive scan and write them as a table.

    Every record in ``options.fasta`` is scanned with every combination of
    half-site length and spacer length. A candidate needs a T upstream and an
    A downstream (or C/G, depending on ``options.cupstream``) and must pass
    each optional rule that is switched on (``t1``, ``a2``, ``tn``, ``gn``,
    ``comp``). Sites whose RVD pair is not unique within the record are
    dropped. The rest are annotated with restriction enzymes that are unique
    to the spacer and written to ``options.outpath``, or to
    ``options.outdir + options.job + options.outfile`` when ``outpath`` is "NA".

    Both the input and the output file are closed even if an error occurs.
    """
    logger = create_logger(options.logFilepath)

    logger("Beginning")

    #Set other parameters
    if options.arraymin is None or options.arraymax is None:
        half_site_size = range(15, 31)
    else:
        half_site_size = range(options.arraymin, options.arraymax + 1)

    if options.min is None or options.max is None:
        spacer_size = [15, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]
    else:
        spacer_size = range(options.min, options.max + 1)

    #Open and read FASTA sequence file
    with open(options.fasta, 'r') as seq_file:
        genes = list(FastaIterator(seq_file, alphabet=generic_dna))

    for gene in genes:
        gene.seq = gene.seq.upper()

    #Candidate sites as {gene: {plus-strand start index: [Binding_site, ...]}}
    gene_binding_sites = {}

    #Scan each gene sequence:
    for gene in genes:
        logger("Scanning %s for binding sites" % (gene.id))
        sequence = gene.seq

        #Check each position along the sequence for possible binding sites using all combinations of binding site lengths and spacer lengths
        for size1 in half_site_size:
            for spacer_len in spacer_size:
                for size2 in half_site_size:
                    site_len = size1 + spacer_len + size2

                    for sindex in range(1, len(sequence) - site_len):
                        site_end = sindex + site_len      #index of the base just after the second half-site
                        site2_start = sindex + size1 + spacer_len

                        upstream_base = sequence[sindex - 1]
                        downstream_base = sequence[site_end]

                        #Check for T at -1 for each half_site (A on plus strand of second halfsite), or C/G when allowed
                        flanked_by_t = options.cupstream != 1 and upstream_base == 'T' and downstream_base == 'A'
                        flanked_by_c = options.cupstream != 0 and upstream_base == 'C' and downstream_base == 'G'
                        if not (flanked_by_t or flanked_by_c):
                            continue

                        #Reject sites containing anything outside the DNA alphabet
                        if len(set(DNA) | set(sequence[sindex:site_end])) != 4:
                            continue

                        #Check for not T at 1 (A at 1 for second halfsite) on both sites
                        if options.t1 and (sequence[sindex] == 'T' or sequence[site_end - 1] == 'A'):
                            continue

                        #Check not A at 2 (T on plus for second halfsite) on both sites
                        if options.a2 and (sequence[sindex + 1] == 'A' or sequence[site_end - 2] == 'T'):
                            continue

                        #Require T at end of both sites so bound by NG (A on plus for second half site)
                        if options.tn and not (sequence[sindex + size1 - 1] == 'T' and sequence[site2_start] == 'A'):
                            continue

                        #Require last position to not be G's
                        if options.gn and (sequence[sindex + size1 - 1] == 'G' or sequence[site2_start] == 'C'):
                            continue

                        half_site1 = sequence[sindex:sindex + size1]
                        half_site2_plus = sequence[site2_start:site_end]
                        spacer_seq = sequence[sindex + size1:site2_start]

                        #Check nucleotide composition of the binding site
                        if options.comp:
                            half_site2_minus = half_site2_plus.reverse_complement()
                            if not (_findtal_old_composition_ok(half_site1) and _findtal_old_composition_ok(half_site2_minus)):
                                continue

                        #Create a binding site now that all enforced rules have been met
                        binding_site = Binding_site(perfectTAL1 = 'none', perfectTAL2 = 'none', start1 = sindex, start2 = site_end - 1, seq1 = half_site1, seq2_plus = half_site2_plus, spacer = spacer_len, spacerseq = spacer_seq, upstream = upstream_base)
                        gene_binding_sites.setdefault(gene, {}).setdefault(sindex, []).append(binding_site)

    #Compute TALs for each gene, using "strong-binding" RVDs for each nucleotide (binds the nucleotide more than half the time and we have more than 10 observations)
    logger('Designing best scoring perfect TALs for each potential site...')
    strong_binding_RVDs = {'A':'NI', 'C':'HD', 'G':'NN', 'T':'NG'}
    RVD_pairs = {} #dictionary of RVD pair counts indexed by gene, RVD1 and RVD2
    for gene in gene_binding_sites:
        pair_counts = RVD_pairs[gene] = {}
        for start in gene_binding_sites[gene]:

            #Find the perfect RVD sequence from each potential plus strand start site
            for binding_site in gene_binding_sites[gene][start]:
                TAL_1 = ' '.join([strong_binding_RVDs[base] for base in binding_site.seq1])
                TAL_2 = ' '.join([strong_binding_RVDs[base] for base in binding_site.seq2_minus])

                binding_site.perfectTAL1 = TAL_1
                binding_site.perfectTAL2 = TAL_2

                counts_for_tal1 = pair_counts.setdefault(TAL_1, {})
                counts_for_tal1[TAL_2] = counts_for_tal1.get(TAL_2, 0) + 1

    #Find lists of binding sites in each gene with unique (for that gene)  perfect RVD sequences
    binding_sites_unique_plus_minus_pairs = {} #binding sites whose RVD sequences don't make up other binding sites in the gene

    for gene in gene_binding_sites:
        pair_counts = RVD_pairs[gene]
        unique_sites = binding_sites_unique_plus_minus_pairs[gene] = {}
        for start in gene_binding_sites[gene]:
            for binding_site in gene_binding_sites[gene][start]:
                rvds = (binding_site.perfectTAL1, binding_site.perfectTAL2)

                #A pairing that was never recorded counts as unique, hence the default of 1
                if all(pair_counts.get(first, {}).get(second, 1) == 1 for first in rvds for second in rvds):
                    unique_sites.setdefault(start, []).append(binding_site)

    #Check binding sites for unique Restriction endonuclease sites within the spacer.
    #Unique sites are those that occur once in the spacer and do not occur in the 250 bases on either side of the spacer.
    logger('Searching for restriction enzymes sites within each spacer...')
    for gene in binding_sites_unique_plus_minus_pairs:
        for start_site in binding_sites_unique_plus_minus_pairs[gene]:
            for binding_site in binding_sites_unique_plus_minus_pairs[gene][start_site]:

                spacer_start = binding_site.start1 + len(binding_site.seq1)
                spacer_end = spacer_start + binding_site.spacer - 1

                #identify sequence to check around the spacer for unique-ness (clamped to the sequence bounds)
                seq_check_start = max(spacer_start - 250, 0)
                seq_check_end = min(spacer_end + 250 + 1, len(gene.seq))

                spacer_str = str(binding_site.spacerseq)
                flank_str = str(gene.seq[seq_check_start : seq_check_end])

                enzymes_in_spacer = []

                #For each enzyme check if it occurs once in the spacer:
                for enzyme in NEB_RE_sites:
                    pattern = NEB_RE_sites[enzyme]["compiled"]

                    #If unique in spacer, check that it doesn't occur again in the flanking sequence
                    if len(pattern.findall(spacer_str)) == 1 and len(pattern.findall(flank_str)) == 1:
                        enzymes_in_spacer.append(enzyme)

                #Create a string listing the enzymes and their sequences that can printed in the output
                enzyme_string = ' '.join(["%s:%s" % (enzyme, NEB_RE_sites[enzyme]["short"]) for enzyme in enzymes_in_spacer])

                if len(enzyme_string) == 0:
                    enzyme_string = 'none'

                #append enzyme string to binding site object
                binding_site.re_sites = enzyme_string

    #Print output results to file: binding sites
    if options.outpath == 'NA':
        filename = options.outdir + options.job + options.outfile
    else:
        filename = options.outpath

    table_ignores = ["TAL1 length", "TAL2 length", "Spacer length"]

    #cupstream 0 allows only T upstream, 1 only C, any other value allows both
    u_bases = []

    if options.cupstream != 1:
        u_bases.append("T")

    if options.cupstream != 0:
        u_bases.append("C")

    with open(filename, 'w') as out:
        out.write("table_ignores:" + ",".join(table_ignores) + "\n")

        out.write("options_used:" + ', '.join([
            "array_min = " + str(options.arraymin),
            "array_max = " + str(options.arraymax),
            "spacer_min = " + str(options.min),
            "spacer_max = " + str(options.max),
            "upstream_base = " + (" or ".join(u_bases)),
            ("No T at position 1" if options.t1 else ""),
            #The a2 rule rejects an A at position 2, so the label says position 2
            ("No A at position 2" if options.a2 else ""),
            ("Sites must end in a T" if options.tn else ""),
            ("Sites may not end in G/NN" if options.gn else ""),
            ("Base composition rules enforced" if options.comp else ""),
        ]) + "\n")

        out.write('Sequence Name\tTAL1 start\tTAL2 start\tTAL1 length\tTAL2 length\tSpacer length\tSpacer range\tTAL1 RVDs\tTAL2 RVDs\tPlus strand sequence\tUnique_RE_sites_in_spacer\n')

        for gene in binding_sites_unique_plus_minus_pairs:
            for start_site in binding_sites_unique_plus_minus_pairs[gene]:
                for binding_site in binding_sites_unique_plus_minus_pairs[gene][start_site]:
                    spacer_start = binding_site.start1 + len(binding_site.seq1)
                    plus_strand = (
                        binding_site.upstream.upper()
                        + " " + str(binding_site.seq1)
                        + ' ' + str(binding_site.spacerseq).lower()
                        + ' ' + str(binding_site.seq2_plus)
                        + " " + ("A" if binding_site.upstream == "T" else "G")
                    )
                    out.write('\t'.join([
                        gene.id,
                        str(binding_site.start1),
                        str(binding_site.start2),
                        str(len(binding_site.seq1)),
                        str(len(binding_site.seq2_plus)),
                        str(binding_site.spacer),
                        str(spacer_start) + '-' + str(spacer_start + binding_site.spacer - 1),
                        binding_site.perfectTAL1,
                        binding_site.perfectTAL2,
                        plus_strand,
                        binding_site.re_sites,
                    ]) + '\n')

    logger('Finished')
def _composition_within_limits(half_site, counted_letter):
    """Check a half-site's base composition against the configured limits.

    half_site is the plus-strand text of the site. counted_letter maps each
    target-strand base ('A', 'C', 'G', 'T') to the letter that is counted in
    half_site: the identity for plus-strand sites, the complement for
    minus-strand sites. Returns True when every base fraction lies between
    percent_comp_range_bottom and percent_comp_range_top (both inclusive).
    """
    site_length = float(len(half_site))
    for base in 'ACGT':
        fraction = half_site.count(counted_letter[base]) / site_length
        if fraction > percent_comp_range_top[base] or fraction < percent_comp_range_bottom[base]:
            return False
    return True


def _perfect_rvd_string(binding_site, strong_binding_RVDs):
    """Return the space-separated RVDs matching a binding site base for base."""
    if binding_site.is_plus:
        target = binding_site.seq1
    else:
        # Minus-strand sites are stored as plus-strand text; the RVDs are
        # designed against the reverse complement.
        target = binding_site.seq1.reverse_complement()
    return ' '.join([strong_binding_RVDs[base] for base in str(target)])


def _scan_gene_for_single_sites(gene, half_site_size, options, gene_binding_sites):
    """Collect every candidate single-TAL binding site of one sequence record.

    Sites are appended to gene_binding_sites[gene][start_index] (both levels
    are created on demand, so records without sites get no entry). For each
    size in half_site_size the plus strand is scanned first, then - when
    options.revcomp is True - the minus strand, which fixes the order of the
    sites stored under one start index.
    """
    sequence = gene.seq
    dna_letters = set(DNA)

    # The rule switches are compared against True explicitly, as the scan
    # always has, so only genuine True/1 values enable a rule.
    no_t_at_1 = options.t1 == True
    no_a_at_2 = options.a2 == True
    must_end_in_t = options.tn == True
    no_g_at_end = options.gn == True
    check_composition = options.comp == True
    search_minus = options.revcomp == True

    # cupstream: 0 allows only T upstream, 1 allows only C, any other value
    # allows both. On the minus strand the upstream base follows the site
    # and shows up as its complement (A / G) in plus-strand text.
    plus_upstream_bases = set()
    minus_upstream_bases = set()
    if options.cupstream != 1:
        plus_upstream_bases.add('T')
        minus_upstream_bases.add('A')
    if options.cupstream != 0:
        plus_upstream_bases.add('C')
        minus_upstream_bases.add('G')

    plus_letters = {'A': 'A', 'C': 'C', 'G': 'G', 'T': 'T'}
    minus_letters = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}

    for size1 in half_site_size:

        # NOTE(review): the stop bound excludes sindex == len(sequence) - size1,
        # so a plus-strand site ending on the final base is never reported,
        # whereas the minus-strand scan does report a site starting at base 0.
        # Kept as-is to preserve existing output - confirm whether intended.
        for sindex in range(1, len(sequence) - size1):

            upstream = sequence[sindex - 1]
            if upstream not in plus_upstream_bases:
                continue

            half_site1 = sequence[sindex:sindex + size1]

            # Reject sites containing anything outside the DNA letters
            if len(dna_letters | set(half_site1)) != 4:
                continue

            # Check for not T at 1
            if no_t_at_1 and sequence[sindex] == 'T':
                continue

            # Check not A at 2
            if no_a_at_2 and sequence[sindex + 1] == 'A':
                continue

            # Require T at end
            if must_end_in_t and sequence[sindex + size1 - 1] != 'T':
                continue

            # Require last position to not be G
            if no_g_at_end and sequence[sindex + size1 - 1] == 'G':
                continue

            # Check nucleotide composition of the binding site
            if check_composition and not _composition_within_limits(half_site1, plus_letters):
                continue

            binding_site = Binding_site(perfectTAL1 = 'none', start1 = sindex, seq1 = half_site1, is_plus=True, upstream=upstream)
            gene_binding_sites.setdefault(gene, {}).setdefault(sindex, []).append(binding_site)

        if not search_minus:
            continue

        # Minus strand: sindex is the plus-strand index of the site's first
        # bound base, i.e. the right-hand end of the stored plus-strand text.
        for sindex in range(size1 - 1, len(sequence) - 1):

            upstream = sequence[sindex + 1]
            if upstream not in minus_upstream_bases:
                continue

            half_site1 = sequence[sindex - size1 + 1:sindex + 1]

            if len(dna_letters | set(half_site1)) != 4:
                continue

            # Check for not T at 1 (A at 1 on plus strand)
            if no_t_at_1 and sequence[sindex] == 'A':
                continue

            # Check not A at 2 (T on plus strand)
            if no_a_at_2 and sequence[sindex - 1] == 'T':
                continue

            # Require T at end so bound by NG (A on plus)
            if must_end_in_t and sequence[sindex - size1 + 1] != 'A':
                continue

            # Require last position to not be G (C on plus)
            if no_g_at_end and sequence[sindex - size1 + 1] == 'C':
                continue

            # Check nucleotide composition of the binding site
            if check_composition and not _composition_within_limits(half_site1, minus_letters):
                continue

            binding_site = Binding_site(perfectTAL1 = 'none', start1 = sindex, seq1 = half_site1, is_plus=False, upstream=upstream)
            gene_binding_sites.setdefault(gene, {}).setdefault(sindex, []).append(binding_site)


def RunFindSingleTALSiteTask(options):
    """Find single TAL effector binding sites and optionally count off-targets.

    Reads the FASTA file options.fasta, scans every record for candidate
    binding sites, designs a perfect-match RVD string for each site and
    writes a tab-separated table to options.outpath (or, when that is 'NA',
    to options.outdir + options.job + options.outfile).

    Options read:
      logFilepath                   -- passed to create_logger and to the
                                       off-target counter
      fasta                         -- path of the FASTA file to scan
      arraymin, arraymax            -- inclusive site-length range; 15..30
                                       is used when either is None
      cupstream                     -- 0: T upstream only, 1: C upstream
                                       only, anything else: either
      t1, a2, tn, gn, comp, revcomp -- optional site rules / minus-strand
                                       search
      gspec                         -- use NH instead of NN for G
      check_offtargets              -- add an "Off-Target Counts" column
      offtargets_fasta, offtargets_ncbi, genome, promoterome, organism
                                    -- choose the off-target sequence file,
                                       in that order of precedence, falling
                                       back to options.fasta
      outpath, outdir, job, outfile -- output location

    Raises:
      TaskError -- if off-target counting is requested but tfcount_found is
                   false, or an NCBI off-target record exceeds
                   OFFTARGET_COUNTING_SIZE_LIMIT.
    """

    logger = create_logger(options.logFilepath)

    logger("Beginning")

    use_ncbi_offtargets = options.check_offtargets and options.offtargets_ncbi != "NA"

    if use_ncbi_offtargets:
        logger("Retrieving NCBI off-target sequence. This could take a while if this sequence hasn't been used recently and needs to be downloaded from NCBI.")

    # Everything runs inside the context manager so that the downloaded
    # NCBI file (when there is one) stays held while it is counted against.
    with Conditional(use_ncbi_offtargets, CachedEntrezFile(logger, options.offtargets_ncbi)) as maybe_entrez_file:

        if options.check_offtargets:

            if not tfcount_found:
                raise TaskError("Non off-target counting worker attempted to process off-target counting task.")

            if options.offtargets_ncbi != "NA":

                logger("Finished retrieving NCBI off-target sequence.")

                # Validate downloaded sequence
                check_fasta_pasta(maybe_entrez_file.file)

                for record in FastaIterator(maybe_entrez_file.file, alphabet=generic_dna):
                    if len(record.seq) > OFFTARGET_COUNTING_SIZE_LIMIT:
                        raise TaskError("Off-Target counting is only supported for NCBI records where all individual sequences are under %d megabases in size" % (OFFTARGET_COUNTING_SIZE_LIMIT / 1000000))

            if options.offtargets_fasta != "NA":
                offtarget_seq_filename = options.offtargets_fasta
            elif options.offtargets_ncbi != "NA":
                offtarget_seq_filename = maybe_entrez_file.filepath
            elif options.genome:
                offtarget_seq_filename = GENOME_FILE % options.organism
            elif options.promoterome:
                offtarget_seq_filename = PROMOTEROME_FILE % options.organism
            else:
                offtarget_seq_filename = options.fasta

        strong_binding_RVDs = {
            'A':'NI',
            'C':'HD',
            'G':'NN',
            'T':'NG'
        }

        if options.gspec:
            strong_binding_RVDs['G'] = 'NH'

        # Open and read FASTA sequence file; the handle is closed even if
        # parsing fails.
        with open(options.fasta, 'r') as seq_file:
            genes = list(FastaIterator(seq_file, alphabet=generic_dna))

        if options.arraymin is None or options.arraymax is None:
            half_site_size = range(15, 31)
        else:
            half_site_size = range(options.arraymin, options.arraymax + 1)

        for gene in genes:
            gene.seq = gene.seq.upper()

        # gene -> start index -> list of Binding_site
        gene_binding_sites = {}

        for gene in genes:
            logger("Scanning %s for binding sites" % (gene.id))
            _scan_gene_for_single_sites(gene, half_site_size, options, gene_binding_sites)

        # Compute TALs for each gene, using "strong-binding" RVDs for each nucleotide (binds the nucleotide more than half the time and we have more than 10 observations)
        logger('Designing best scoring perfect TALs for each potential site...')

        for sites_by_start in gene_binding_sites.values():
            for sites in sites_by_start.values():
                for binding_site in sites:
                    binding_site.perfectTAL1 = _perfect_rvd_string(binding_site, strong_binding_RVDs)

        if options.outpath == 'NA':
            filename = options.outdir + options.job + options.outfile
        else:
            filename = options.outpath

        # Flatten into output order and remember which record each site
        # came from.
        binding_sites = []
        for gene in sorted(gene_binding_sites):
            for sites in gene_binding_sites[gene].values():
                for binding_site in sites:
                    binding_site.gene_id = gene.id
                    binding_sites.append(binding_site)

        if options.check_offtargets and len(binding_sites) > 0:

            off_target_seqs = [binding_site.perfectTAL1 for binding_site in binding_sites]

            # NOTE(review): 3.0 is passed through unchanged; presumably a
            # score cutoff for the counter - confirm against TargetFinderCountTask.
            off_target_counts = TargetFinderCountTask(offtarget_seq_filename, options.logFilepath, options.cupstream, 3.0, off_target_seqs)

            for i, binding_site in enumerate(binding_sites):
                binding_site.offtarget_count = off_target_counts[i]

        table_ignores = []
        if not options.revcomp:
            table_ignores.append("Plus strand sequence")

        u_bases = []

        if options.cupstream != 1:
            u_bases.append("T")

        if options.cupstream != 0:
            u_bases.append("C")

        offtarget_header = "\tOff-Target Counts" if options.check_offtargets else ""

        with open(filename, 'w') as out:

            if len(table_ignores) > 0:
                out.write("table_ignores:" + ",".join(table_ignores) + "\n")

            # Disabled rules contribute empty entries, which keeps the
            # number of comma-separated fields constant.
            out.write("options_used:" + ', '.join([
                "array_min = " + str(options.arraymin),
                "array_max = " + str(options.arraymax),
                "upstream_base = " + (" or ".join(u_bases)),
                ("No T at position 1" if options.t1 else ""),
                # The a2 rule rejects an A at position 2 of the site.
                ("No A at position 2" if options.a2 else ""),
                ("Sites must end in a T" if options.tn else ""),
                ("Sites may not end in G/NN" if options.gn else ""),
                ("Base composition rules enforced" if options.comp else ""),
                ("Search reverse complement" if options.revcomp else ""),
            ]) + "\n")

            out.write('Sequence Name\tTAL start\tTAL length\tRVD sequence\tStrand\tTarget sequence\tPlus strand sequence' + offtarget_header + '\n')

            for binding_site in binding_sites:

                offtarget_string = ""

                if options.check_offtargets:
                    offtarget_string = "\t%d" % binding_site.offtarget_count

                if binding_site.is_plus:
                    strand = 'Plus'
                    target_text = binding_site.upstream + " " + str(binding_site.seq1)
                    plus_text = binding_site.upstream + " " + str(binding_site.seq1)
                else:
                    strand = 'Minus'
                    # Show the target as the TAL reads it: complemented
                    # upstream base followed by the reverse complement.
                    target_text = ("T" if binding_site.upstream == "A" else "C") + " " + str(binding_site.seq1.reverse_complement())
                    plus_text = str(binding_site.seq1) + " " + binding_site.upstream

                out.write('\t'.join([
                    binding_site.gene_id,
                    str(binding_site.start1),
                    str(len(binding_site.seq1)),
                    binding_site.perfectTAL1,
                    strand,
                    target_text,
                    plus_text,
                ]) + offtarget_string + '\n')

        logger('Finished')
def _composition_within_limits(half_site, counted_letter):
    """Check a half-site's base composition against the configured limits.

    half_site is the plus-strand text of the site. counted_letter maps each
    target-strand base ('A', 'C', 'G', 'T') to the letter that is counted in
    half_site: the identity for plus-strand sites, the complement for
    minus-strand sites. Returns True when every base fraction lies between
    percent_comp_range_bottom and percent_comp_range_top (both inclusive).
    """
    site_length = float(len(half_site))
    for base in 'ACGT':
        fraction = half_site.count(counted_letter[base]) / site_length
        if fraction > percent_comp_range_top[base] or fraction < percent_comp_range_bottom[base]:
            return False
    return True


def _perfect_rvd_string(binding_site, strong_binding_RVDs):
    """Return the space-separated RVDs matching a binding site base for base."""
    if binding_site.is_plus:
        target = binding_site.seq1
    else:
        # Minus-strand sites are stored as plus-strand text; the RVDs are
        # designed against the reverse complement.
        target = binding_site.seq1.reverse_complement()
    return ' '.join([strong_binding_RVDs[base] for base in str(target)])


def _scan_gene_for_single_sites(gene, half_site_size, options, gene_binding_sites):
    """Collect every candidate single-TAL binding site of one sequence record.

    Sites are appended to gene_binding_sites[gene][start_index] (both levels
    are created on demand, so records without sites get no entry). For each
    size in half_site_size the plus strand is scanned first, then - when
    options.revcomp is True - the minus strand, which fixes the order of the
    sites stored under one start index.
    """
    sequence = gene.seq
    dna_letters = set(DNA)

    # The rule switches are compared against True explicitly, as the scan
    # always has, so only genuine True/1 values enable a rule.
    no_t_at_1 = options.t1 == True
    no_a_at_2 = options.a2 == True
    must_end_in_t = options.tn == True
    no_g_at_end = options.gn == True
    check_composition = options.comp == True
    search_minus = options.revcomp == True

    # cupstream: 0 allows only T upstream, 1 allows only C, any other value
    # allows both. On the minus strand the upstream base follows the site
    # and shows up as its complement (A / G) in plus-strand text.
    plus_upstream_bases = set()
    minus_upstream_bases = set()
    if options.cupstream != 1:
        plus_upstream_bases.add('T')
        minus_upstream_bases.add('A')
    if options.cupstream != 0:
        plus_upstream_bases.add('C')
        minus_upstream_bases.add('G')

    plus_letters = {'A': 'A', 'C': 'C', 'G': 'G', 'T': 'T'}
    minus_letters = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}

    for size1 in half_site_size:

        # NOTE(review): the stop bound excludes sindex == len(sequence) - size1,
        # so a plus-strand site ending on the final base is never reported,
        # whereas the minus-strand scan does report a site starting at base 0.
        # Kept as-is to preserve existing output - confirm whether intended.
        for sindex in range(1, len(sequence) - size1):

            upstream = sequence[sindex - 1]
            if upstream not in plus_upstream_bases:
                continue

            half_site1 = sequence[sindex:sindex + size1]

            # Reject sites containing anything outside the DNA letters
            if len(dna_letters | set(half_site1)) != 4:
                continue

            # Check for not T at 1
            if no_t_at_1 and sequence[sindex] == 'T':
                continue

            # Check not A at 2
            if no_a_at_2 and sequence[sindex + 1] == 'A':
                continue

            # Require T at end
            if must_end_in_t and sequence[sindex + size1 - 1] != 'T':
                continue

            # Require last position to not be G
            if no_g_at_end and sequence[sindex + size1 - 1] == 'G':
                continue

            # Check nucleotide composition of the binding site
            if check_composition and not _composition_within_limits(half_site1, plus_letters):
                continue

            binding_site = Binding_site(perfectTAL1 = 'none', start1 = sindex, seq1 = half_site1, is_plus=True, upstream=upstream)
            gene_binding_sites.setdefault(gene, {}).setdefault(sindex, []).append(binding_site)

        if not search_minus:
            continue

        # Minus strand: sindex is the plus-strand index of the site's first
        # bound base, i.e. the right-hand end of the stored plus-strand text.
        for sindex in range(size1 - 1, len(sequence) - 1):

            upstream = sequence[sindex + 1]
            if upstream not in minus_upstream_bases:
                continue

            half_site1 = sequence[sindex - size1 + 1:sindex + 1]

            if len(dna_letters | set(half_site1)) != 4:
                continue

            # Check for not T at 1 (A at 1 on plus strand)
            if no_t_at_1 and sequence[sindex] == 'A':
                continue

            # Check not A at 2 (T on plus strand)
            if no_a_at_2 and sequence[sindex - 1] == 'T':
                continue

            # Require T at end so bound by NG (A on plus)
            if must_end_in_t and sequence[sindex - size1 + 1] != 'A':
                continue

            # Require last position to not be G (C on plus)
            if no_g_at_end and sequence[sindex - size1 + 1] == 'C':
                continue

            # Check nucleotide composition of the binding site
            if check_composition and not _composition_within_limits(half_site1, minus_letters):
                continue

            binding_site = Binding_site(perfectTAL1 = 'none', start1 = sindex, seq1 = half_site1, is_plus=False, upstream=upstream)
            gene_binding_sites.setdefault(gene, {}).setdefault(sindex, []).append(binding_site)


def RunFindSingleTALSiteTask(options):
    """Find single TAL effector binding sites in a FASTA file.

    Reads the FASTA file options.fasta, scans every record for candidate
    binding sites, designs a perfect-match RVD string for each site and
    writes a tab-separated table to options.outpath (or, when that is 'NA',
    to options.outdir + options.job + options.outfile).

    Options read:
      logFilepath                   -- passed to create_logger
      fasta                         -- path of the FASTA file to scan
      arraymin, arraymax            -- inclusive site-length range; 15..30
                                       is used when either is None
      cupstream                     -- 0: T upstream only, 1: C upstream
                                       only, anything else: either
      t1, a2, tn, gn, comp, revcomp -- optional site rules / minus-strand
                                       search
      gspec                         -- use NH instead of NN for G
      outpath, outdir, job, outfile -- output location
    """

    logger = create_logger(options.logFilepath)

    strong_binding_RVDs = {
        'A':'NI',
        'C':'HD',
        'G':'NN',
        'T':'NG'
    }

    if options.gspec:
        strong_binding_RVDs['G'] = 'NH'

    # Open and read FASTA sequence file; the handle is closed even if
    # parsing fails.
    with open(options.fasta, 'r') as seq_file:
        genes = list(FastaIterator(seq_file, alphabet=generic_dna))

    if options.arraymin is None or options.arraymax is None:
        half_site_size = range(15, 31)
    else:
        half_site_size = range(options.arraymin, options.arraymax + 1)

    for gene in genes:
        gene.seq = gene.seq.upper()

    # gene -> start index -> list of Binding_site
    gene_binding_sites = {}

    for gene in genes:
        logger("Scanning %s for binding sites" % (gene.id))
        _scan_gene_for_single_sites(gene, half_site_size, options, gene_binding_sites)

    # Compute TALs for each gene, using "strong-binding" RVDs for each nucleotide (binds the nucleotide more than half the time and we have more than 10 observations)
    logger('Designing best scoring perfect TALs for each potential site...')

    for sites_by_start in gene_binding_sites.values():
        for sites in sites_by_start.values():
            for binding_site in sites:
                binding_site.perfectTAL1 = _perfect_rvd_string(binding_site, strong_binding_RVDs)

    if options.outpath == 'NA':
        filename = options.outdir + options.job + options.outfile
    else:
        filename = options.outpath

    table_ignores = []
    if not options.revcomp:
        table_ignores.append("Plus strand sequence")

    u_bases = []

    if options.cupstream != 1:
        u_bases.append("T")

    if options.cupstream != 0:
        u_bases.append("C")

    with open(filename, 'w') as out:

        if len(table_ignores) > 0:
            out.write("table_ignores:" + ",".join(table_ignores) + "\n")

        # Disabled rules contribute empty entries, which keeps the number
        # of comma-separated fields constant.
        out.write("options_used:" + ', '.join([
            "array_min = " + str(options.arraymin),
            "array_max = " + str(options.arraymax),
            "upstream_base = " + (" or ".join(u_bases)),
            ("No T at position 1" if options.t1 else ""),
            # The a2 rule rejects an A at position 2 of the site.
            ("No A at position 2" if options.a2 else ""),
            ("Sites must end in a T" if options.tn else ""),
            ("Sites may not end in G/NN" if options.gn else ""),
            ("Base composition rules enforced" if options.comp else ""),
            ("Search reverse complement" if options.revcomp else ""),
        ]) + "\n")

        out.write('Sequence Name\tTAL start\tTAL length\tRVD sequence\tStrand\tTarget sequence\tPlus strand sequence\n')

        for gene in sorted(gene_binding_sites):
            for sites in gene_binding_sites[gene].values():
                for binding_site in sites:

                    if binding_site.is_plus:
                        strand = 'Plus'
                        target_text = binding_site.upstream + " " + str(binding_site.seq1)
                        plus_text = binding_site.upstream + " " + str(binding_site.seq1)
                    else:
                        strand = 'Minus'
                        # Show the target as the TAL reads it: complemented
                        # upstream base followed by the reverse complement.
                        target_text = ("T" if binding_site.upstream == "A" else "C") + " " + str(binding_site.seq1.reverse_complement())
                        plus_text = str(binding_site.seq1) + " " + binding_site.upstream

                    out.write('\t'.join([
                        gene.id,
                        str(binding_site.start1),
                        str(len(binding_site.seq1)),
                        binding_site.perfectTAL1,
                        strand,
                        target_text,
                        plus_text,
                    ]) + '\n')

    logger('Finished')