Exemple #1
0
def rc_regions(gb, choice='whole'):
    """
    Reverse and complement given region of sequence.

    The genbank record is read from gb. For choice='whole' the complete
    sequence is passed through rc(). Otherwise the regions returned by
    get_regions() are extracted, the chosen one is replaced by its rc()
    and the four regions are joined again in the order LSC, IRa, SSC, IRb.
    The result is written as a single-record fasta file.

    Args:
        gb(Path or str): rotate_seq generated gb file
        choice(str): region to be processed, must be in 'LSC', 'IRa', 'SSC',
        'IRb', 'whole'.
    Return:
        new_file(Path): reverse-complemented fasta
    """
    from pathlib import Path

    # with_suffix() exists only on Path objects, so normalise the documented
    # str input before using it.
    gb = Path(gb)
    raw = SeqIO.read(gb, 'gb')
    regions = get_regions(gb)
    data = {}
    for r in regions:
        data[r] = regions[r].extract(raw).seq
    if choice != 'whole':
        data[choice] = rc(regions[choice].extract(raw.seq))
        new_seq = data['LSC']
        for name in ('IRa', 'SSC', 'IRb'):
            new_seq += data[name]
    else:
        new_seq = rc(raw.seq)
    new_name = '_RC_' + raw.name
    # Replaces the last suffix of the input name with '.rc.rc'.
    new_file = gb.with_suffix('.rc.rc')
    with open(new_file, 'w') as out:
        out.write(f'>{new_name}\n')
        out.write(f'{new_seq}\n')
    return new_file
Exemple #2
0
def main():
    """Sample short sequences from FASTA files into a tab-separated file.

    Command line options:
      -i  glob pattern of the input FASTA files (required)
      -w  window size; one random start position is drawn per window
      -s  length of each written sample sequence
      -f  fragment size; when given, both ends of each fragment are written
      -o  output file (required)

    For every record, each window except the last one contributes one
    slice starting at a random offset inside the window. Slices containing
    "N" are skipped. Without -f the slice (``sample`` bases) is written as
    one line. With -f two lines are written: the first ``sample`` bases of
    the fragment and rc() of its last ``sample`` bases. Every output line
    is "<sequence>\\t<running number>".

    Exits with status 1 when a required option is missing or when the
    pattern matches no file.
    """
    usage = "usage: %prog [options]"
    description = "blah blah blah"

    optparser = OptionParser(version="%prog 0.1", description=description,
                             usage=usage, add_help_option=False)
    optparser.add_option("-h", "--help", action="help",
                         help="Show this help message and exit.")
    optparser.add_option("-i", "--ifile", dest="ifile", type="string",
                         help="input files, if you give a pattern for files, please use \" to surround the pattern string")
    optparser.add_option("-w", "--window", dest="window", type="int",
                         default=1000, help="window to extract a sample sequence, default:1000")
    optparser.add_option("-s", "--sample", dest="sample", type="int",
                         default=36, help="size of a sample sequence,default=36")
    optparser.add_option("-f", "--frag", dest="frag", type="int",
                         help="if pair-end calculation is needed, give the fragment size")
    optparser.add_option("-o", "--ofile", dest="ofile",
                         help="output file")
    (options, args) = optparser.parse_args()
    if not options.ifile or not options.window or not options.sample or not options.ofile:
        optparser.print_help()
        sys.exit(1)

    # Resolve the inputs before touching the output file, so a bad pattern
    # does not leave an empty output file (and an open handle) behind.
    files = glob(options.ifile)
    if not files:
        sys.stderr.write("no file found: %s\n" % (options.ifile))
        sys.exit(1)

    window = options.window
    sample = options.sample
    m = 0  # running number written next to every sequence
    with open(options.ofile, "w") as ohd:
        for f in files:
            sys.stdout.write("%s ... \n" % (f))
            sys.stdout.flush()
            with open(f, "r") as fahd:
                for record in SeqIO.parse(fahd, "fasta"):
                    # Floor division keeps n an int on Python 3 as well.
                    n = len(record.seq) // window
                    # The final window is left out, as in the original loop.
                    for l in range(0, n - 1):
                        s = l * window + randint(0, window - 1)
                        if options.frag:
                            # Paired mode: both ends of one fragment.
                            fragment = str(record.seq[s:s + options.frag]).upper()
                            if "N" not in fragment:
                                m += 1
                                ohd.write("%s\t%d\n" % (fragment[:sample], m))
                                m += 1
                                ohd.write("%s\t%d\n" % (rc(fragment[-1 * sample:]), m))
                        else:
                            # Single mode. This branch used to hang off the
                            # N-check above, so it never ran without -f.
                            fragment = str(record.seq[s:s + sample]).upper()
                            if "N" not in fragment:
                                m += 1
                                ohd.write("%s\t%d\n" % (fragment, m))
Exemple #3
0
def cseguid(seq):
    '''Return the cSEGUID of a circular DNA sequence.

    The cSEGUID is the url safe SEGUID checksum calculated for the
    lexicographically minimal string rotation of a DNA sequence. The
    smallest rotation is computed for the upper-cased sequence and for its
    reverse complement; the smaller of the two is checksummed.
    Only defined for circular sequences.
    '''
    from Bio.Seq import reverse_complement as rc

    forward_rotation = SmallestRotation(str(seq).upper())
    reverse_rotation = SmallestRotation(str(rc(seq)).upper())
    canonical = min(forward_rotation, reverse_rotation)
    checksum = seguid(canonical)
    return pretty_string(checksum)
Exemple #4
0
def _rc(record):
    """Reverse-complement a str, Seq or SeqRecord.

    A str goes through rc(); a Seq uses its own reverse_complement(); a
    SeqRecord yields a new SeqRecord carrying the reverse-complemented
    sequence together with the original id, name and description.

    Raises:
        ValueError: if record is none of the three supported types.
    """
    if isinstance(record, str):
        return rc(record)
    if isinstance(record, Seq):
        return record.reverse_complement()
    if isinstance(record, SeqRecord):
        flipped = record.seq.reverse_complement()
        return SeqRecord(flipped, id=record.id, name=record.name,
                         description=record.description)
    raise ValueError('record must be one of str, Bio.Seq, or Bio.SeqRecord')
Exemple #5
0
 def primer_dict(self):
     """Return the primer table, building and caching it on first use.

     The table maps a fragment name to a two-item list: the first primer
     as written and the second primer passed through Bio.Seq's
     reverse_complement. The result is stored in self._primer_dict and
     returned unchanged on later calls.
     """
     if self._primer_dict is not None:
         return self._primer_dict

     from Bio.Seq import reverse_complement as rc
     # (fragment, first primer, second primer before reverse complement)
     primer_pairs = (
         ('F1', 'CTCAATAAAGCTTGCCTTGAGTGC', 'ACTGTATCATCTGCTCCTGTRTCT'),
         ('F2', 'AAATTGCAGGGCYCCTAG', 'CTRTTAGCTGCCCCATCTACATAG'),
         ('F3B', 'CACACTAATGATGTAARACARTTAACAG', 'GGGATGTGTACTTCTGAACTTAYTYTTGG'),
         ('F4', 'CGGGTTTATTWCAGRGACAGCAGA', 'GGGGTTAAYTTTACACATGGYTTTA'),
         ('F5a', 'GGCATYTCCTATGGCAGGAAGAAG', 'GTGGTGCARATGAGTTTTCCAGAGCA'),
         ('F6', 'GGGTTCTTRGGARCAGCAGGAAG', 'ATTGAGGCTTAAGCAGTGGGTTC'),
     )
     self._primer_dict = {fragment: [first, rc(second)]
                          for fragment, first, second in primer_pairs}
     return self._primer_dict
Exemple #6
0
def _rc(record):
    """Return the reverse complement of record.

    Supported inputs are str (handled by rc()), Seq (handled by its
    reverse_complement() method) and SeqRecord (a new record is built from
    the reverse-complemented sequence, keeping id, name and description).
    Anything else raises ValueError.
    """
    if not isinstance(record, (str, Seq, SeqRecord)):
        raise ValueError(
            'record must be one of str, Bio.Seq, or Bio.SeqRecord'
            )
    if isinstance(record, str):
        return rc(record)
    if isinstance(record, Seq):
        return record.reverse_complement()
    # Only SeqRecord is left at this point.
    reversed_seq = record.seq.reverse_complement()
    return SeqRecord(
        reversed_seq,
        id=record.id,
        name=record.name,
        description=record.description
        )
Exemple #7
0
def reads_to_seqrecord(reads):
    '''Build FASTQ records out of BAM reads.

    Reads flagged as reverse are put back into their original orientation:
    the sequence is reverse complemented and the quality values are
    reversed so that each score stays with its base.

    Note: copied from Bio.SeqIO.QualityIO.py

    Parameters
    ----------
    reads : iterable
        Read objects exposing ``qname``, ``seq``, ``qual`` (quality string
        decoded here with an offset of ord("!")) and ``is_reverse``.

    Returns
    -------
    list of SeqRecord
        One record per read, with the decoded qualities stored under
        "phred_quality"; id, name and description are all the read name.

    Raises
    ------
    ValueError
        If a decoded quality falls outside 0..93.
    '''
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import reverse_complement as rc
    try:
        from Bio.Alphabet import IUPAC
        seq_args = (IUPAC.ambiguous_dna,)
    except ImportError:
        # Bio.Alphabet was removed in Biopython 1.78; Seq takes no alphabet there.
        seq_args = ()

    # Precompute conversion table. All 256 single-byte characters are
    # covered so an out-of-range one reaches the ValueError below.
    SANGER_SCORE_OFFSET = ord("!")
    q_mapping = {chr(code): code - SANGER_SCORE_OFFSET for code in range(256)}

    seqs = []
    for read in reads:
        # Get the sequence first
        read_name = read.qname
        is_reverse = read.is_reverse
        sequence = rc(read.seq) if is_reverse else read.seq
        record = SeqRecord(Seq(sequence, *seq_args),
                           id=read_name, name=read_name, description=read_name)

        # Get the qualities second
        qualities = [q_mapping[letter] for letter in read.qual]
        if qualities and (min(qualities) < 0 or max(qualities) > 93):
            raise ValueError("Invalid character in quality string")
        if is_reverse:
            # Keep the scores aligned with the reverse-complemented bases.
            qualities.reverse()
        # Written through dict.__setitem__ on the private attribute, which
        # bypasses any __setitem__ override of that container (as in the
        # Biopython code this was copied from).
        dict.__setitem__(record._per_letter_annotations,
                         "phred_quality", qualities)

        seqs.append(record)

    return seqs
Exemple #8
0
# Copy of fragments_genes in which the 'tat' and 'rev' entries are replaced by
# one key per item ('tat1'/'tat2', 'rev1'/'rev2'); presumably one per exon,
# going by the variable name -- confirm against fragments_genes.
# .items() is used instead of .iteritems() so this also runs on Python 3.
fragments_exons = {g: frs for (g, frs) in fragments_genes.items()
                   if g not in ('tat', 'rev')}
fragments_exons['tat1'] = fragments_genes['tat'][0]
fragments_exons['tat2'] = fragments_genes['tat'][1]
fragments_exons['rev1'] = fragments_genes['rev'][0]
fragments_exons['rev2'] = fragments_genes['rev'][1]

# RNA structure name -> names of the fragments listed for it
fragments_RNA_structures = {
    'RRE': ['F5', 'F6'],
    "LTR5'": ['F1'],
    "LTR3'": ['F6'],
}

# Other region name -> names of the fragments listed for it
fragments_other = {
    'env peptide': ['F4', 'F5'],
    'psi': ['F1'],
}

# Inner primers per fragment, stored as [forward, reverse].
# Note: the reverse primers get reverse complemented (so everything is positive sense)
primers_inner = {
    fragment: [forward, rc(reverse)]
    for fragment, forward, reverse in (
        ('F1', 'AAGTAGTGTGTGCCCGTCTGT', 'TGCCAAAGAGTGATYTGAGGG'),
        ('F2', 'GGGCTGTTGGARATGTGG', 'ACAAACTCCCAYTCAGGAATCCA'),
        ('F3a', 'GAAAGCATAGTRATATGGGGAAA', 'CACCTGCCATCTGTTTTCCATA'),
        ('F3b', 'GAAAGCATAGTRATATGGGGAAA', 'CACCTGCCATCTGTTTTCCATA'),
        ('F3B', 'GAAAGCATAGTRATATGGGGAAA', 'CACCTGCCATCTGTTTTCCATA'),
        ('F4', 'TGGAAAGGTGAAGGGGCAG', 'GTACACAGGCATGTGTRGCCCA'),
        ('F5a', 'TAAGAGAAAGAGCAGAAGACAGTGG', 'CCAAATYCCYAGGAGCTGTTGATC'),
        ('F5b', 'TCTATTATGGRGTACCTGTRTGG', 'CCAAATYCCYAGGAGCTGTTG'),
        ('F6', 'CAGGAAGCACTATGGGCGC', 'CCAGAGAGCTCCCAGG'),
    )
}

primers_outer = {'F1': ['CTCAATAAAGCTTGCCTTGAGTGC', rc('ACTGTATCATCTGCTCCTGTRTCT')],
                 'F2': ['AAATTGCAGGGCYCCTAG', rc('CTRTTAGCTGCCCCATCTACATAG')],
                 'F3a': ['CACACTAATGATGTAARACARTTAACAG', rc('TTCCATGTTYTAATCCTCATCCTGTCTAC')],
                 # NOTE: F3b and F3B are actually the same, but I forgot about the last G for
                 # the biggest part of the dataset. It's not a huge problem because that G is
Exemple #9
0
# Edges of RNA structures, as [first edge, second edge]; None where no edge
# sequence is given for that side.
RRE_edges = [
    'AGGAGCTATGTTCCTTGGGT',
    'ACCTAAGGGATACACAGCTCCT',
]
LTR5 = [
    None,
    'CTCTAGCA',
]
LTR3 = [
    'TGGANGGGNTANTTNNNTC',
    None,
]

# Lookup by structure name; the values are the very lists defined above.
RNA_structure_edges = {
    'RRE': RRE_edges,
    "LTR5'": LTR5,
    "LTR3'": LTR3,
}

# Edges of other regions, each stored as a two-item list
env_peptide_edges = ['ATGAGAGTGAAGGAGAA',
                     'GCTCCTTGGGATGTTGATGATCTGTAGTGCT']
psi_element = ['CTCGGCTTGCT',
               'AGCGGAGGCTAG']

# V1, V3, V4, and V5 actually start INSIDE these primers
V1_edges = ['AANCCATGTGTAAAANTAACNCCACTNTGTGTNANTTTANAN',
            'TGCTCTTTCAATNTCANCNCANNNNTAANA']
# The second V3 edge is passed through rc() before being stored.
V3_edges = ['ACAATGYACACATGGAATTARGCCA',
            rc('AGAAAAATTCYCCTCYACAATTAAA')]
V4_edges = ['TTGTAANGCACANTTTTAATTGTGGAGGGGAATTTTTCTAC',
            'AGAATAANACAAATTNTAAACANGTGGCAGNAAGTAGGA']
V5_edges = ['ATCAAATATTACAGGGNTNNTAACAAGAGATGGNGGN',
            'GNAGGAGGANATATGANGGANAATTGGAGAAGT']

# V2 is particular: from the left it starts right there, from the right it ends
# INSIDE this primer
V2_edges = ['TGCTCTTTCAATNTCANCNCANNNNTAANA',
            'AANACCTCANTCATTACACANGCNTGTCCAAANNTATCCTTTGANCCAATTCC']
gp120_noVloops_edges = [
Exemple #10
0
# Register single items of the 'tat' and 'rev' entries of fragments_genes
# under their own keys (index 1 of 'tat', indices 0 and 1 of 'rev').
# NOTE(review): presumably one item per exon, going by the fragments_exons
# name -- confirm against the definition of fragments_genes.
fragments_exons['tat2'] = fragments_genes['tat'][1]
fragments_exons['rev1'] = fragments_genes['rev'][0]
fragments_exons['rev2'] = fragments_genes['rev'][1]

# RNA structure name -> names of the fragments listed for it
fragments_RNA_structures = {'RRE': ['F5', 'F6'],
                            "LTR5'": ['F1'],
                            "LTR3'": ['F6']}

# Other region name -> names of the fragments listed for it
fragments_other = {'env peptide': ['F4', 'F5'],
                   'psi': ['F1']}

# Note: the reverse primers get reverse complemented (so everything is positive sense)
primers_inner = {
    'F1': ['AAGTAGTGTGTGCCCGTCTGT',
           rc('TGCCAAAGAGTGATYTGAGGG')],
    'F2': ['GGGCTGTTGGARATGTGG',
           rc('ACAAACTCCCAYTCAGGAATCCA')],
    'F3a': ['GAAAGCATAGTRATATGGGGAAA',
            rc('CACCTGCCATCTGTTTTCCATA')],
    'F3b': ['GAAAGCATAGTRATATGGGGAAA',
            rc('CACCTGCCATCTGTTTTCCATA')],
    'F3B': ['GAAAGCATAGTRATATGGGGAAA',
            rc('CACCTGCCATCTGTTTTCCATA')],
    'F4': ['TGGAAAGGTGAAGGGGCAG',
           rc('GTACACAGGCATGTGTRGCCCA')],
    'F5a': ['TAAGAGAAAGAGCAGAAGACAGTGG',
            rc('CCAAATYCCYAGGAGCTGTTGATC')],
    'F5b': ['TCTATTATGGRGTACCTGTRTGG',
            rc('CCAAATYCCYAGGAGCTGTTG')],
    'F6': ['CAGGAAGCACTATGGGCGC',
# Edge sequences of the RNA structures, as [first edge, second edge];
# None where no edge sequence is given for that side.
RRE_edges = [
    'AGGAGCTATGTTCCTTGGGT',
    'ACCTAAGGGATACACAGCTCCT',
]
LTR5 = [None, 'CTCTAGCA']
LTR3 = ['TGGANGGGNTANTTNNNTC', None]

# Lookup by structure name; the values are the very lists defined above.
RNA_structure_edges = {
    'RRE': RRE_edges,
    "LTR5'": LTR5,
    "LTR3'": LTR3,
}


# Edges of other regions, each stored as a two-item list
env_peptide_edges = [
    'ATGAGAGTGAAGGAGAA',
    'GCTCCTTGGGATGTTGATGATCTGTAGTGCT',
]
psi_element = [
    'CTCGGCTTGCT',
    'AGCGGAGGCTAG',
]
# V1, V3, V4, and V5 actually start INSIDE these primers
V1_edges = [
    'AANCCATGTGTAAAANTAACNCCACTNTGTGTNANTTTANAN',
    'TGCTCTTTCAATNTCANCNCANNNNTAANA',
]
# The second V3 edge is passed through rc() before being stored.
V3_edges = [
    'ACAATGYACACATGGAATTARGCCA',
    rc('AGAAAAATTCYCCTCYACAATTAAA'),
]
V4_edges = [
    'TTGTAANGCACANTTTTAATTGTGGAGGGGAATTTTTCTAC',
    'AGAATAANACAAATTNTAAACANGTGGCAGNAAGTAGGA',
]
V5_edges = [
    'ATCAAATATTACAGGGNTNNTAACAAGAGATGGNGGN',
    'GNAGGAGGANATATGANGGANAATTGGAGAAGT',
]
# V2 is particular: from the left it starts right there, from the right it ends
# INSIDE this primer
V2_edges = [
    'TGCTCTTTCAATNTCANCNCANNNNTAANA',
    'AANACCTCANTCATTACACANGCNTGTCCAAANNTATCCTTTGANCCAATTCC',
]
gp120_noVloops_edges = ['NNAGAANANTTGTGGGTCACAGTCTATTATGGGGTACCT',
                        'AAAGCCTAAAGCCATGTGTAAAATTAACCCCACTCTGTGTTAGTTTAAAG',
                        'AANACCTCANTCATTACACANGCNTGTCCAAANNTATCCTTTGANCCAATTCC',
                        'ACAGTACAATGTACACATGGAATTANNCCA',
                        'TTTAATTGTGGAGGGGAATTTTTCT',
                        'GGGGAATTTTTCTAC',
                        'GAATAAAACAAATTNTAAACANGTGGCAGAAAGTAGGAAAAGCA',
                        'ATTACAGGGNTNNTATTAACAAGAGATGGTGGT',
Exemple #12
0
def cseguid(seq):
    '''Return the cSEGUID for the sequence.

    The cSEGUID is the url safe SEGUID checksum calculated for the
    lexicographically minimal string rotation of a DNA sequence. Both the
    sequence and its reverse complement are upper-cased and rotated, and
    the smaller of the two rotations is the one that gets checksummed.
    Only defined for circular sequences.
    '''
    from Bio.Seq import reverse_complement as rc

    def _smallest_rotation(strand):
        # Upper-case first so the result does not depend on letter case.
        return SmallestRotation(str(strand).upper())

    smallest = min(_smallest_rotation(seq), _smallest_rotation(rc(seq)))
    return pretty_string(seguid(smallest))
Exemple #13
0
def copy_features(source_sr, target_sr, limit = 10):
    '''Copy the features of source_sr to every place in target_sr where
    their sequence occurs.

    Each feature of source_sr longer than limit is extracted and its
    upper-cased sequence is searched for (non-overlapping, literal matches)
    in the upper-cased target sequence and in its reverse complement. For
    every hit a new SeqFeature with the same type, location_operator, id
    and qualifiers is created at the hit position (strand 1 for the given
    strand, -1 for the reverse complement) and appended to
    target_sr.features.

    Source_sr and target_sr are objects with a features property, such as
    Dseqrecord or Biopython SeqRecord.

    Parameters
    ----------

    source_sr : SeqRecord or Dseqrecord
        The sequence to copy features from

    target_sr : SeqRecord or Dseqrecord
        The sequence to copy features to

    limit : int, optional
        Only features with len(feature) > limit are copied (default 10).

    Returns
    -------
    bool : True
        This function acts on target_sr in place.
        No data is returned.


    '''
    import re
    from Bio.Seq import reverse_complement as rc
    target_length    = len(target_sr)
    target_string    = str(target_sr.seq).upper()

    try:
        circular = bool(target_sr.circular)
    except AttributeError:
        circular = False

    newfeatures = []

    trgt_string = target_string
    trgt_string_rc = rc(trgt_string)

    for feature in [f for f in source_sr.features if len(f) > limit]:
        fsr            = feature.extract(source_sr).upper()
        # NOTE: the wrap-around extension is switched off (length 0 instead
        # of len(fsr)), so for now the search strings equal the plain target
        # and the origin-spanning branch below cannot be reached.
        featurelength  = 0# len(fsr)

        if circular:
            trgt_string = target_string + target_string[:featurelength]
            trgt_string_rc = rc(trgt_string)

        # Escape the sequence so it is matched literally, not as a regex.
        pattern = re.compile(re.escape(str(fsr.seq)))
        rc_length = len(trgt_string_rc)

        positions = (
            [(m.start(), m.end(), 1,) for m in pattern.finditer(trgt_string)]
            +
            # Hits on the reverse complement are mapped back to coordinates
            # on the given strand.
            [(rc_length - m.end(), rc_length - m.start(), -1,)
             for m in pattern.finditer(trgt_string_rc)])

        for begin, end, strand in positions:
            if circular and begin < target_length < end:
                # Hit runs over the end of the circular target: split it
                # into a part up to the end and a part from position 0.
                end = end - target_length
                # Was FeatureLocation(begin, trgt_length): an undefined name.
                sf1 = SeqFeature(FeatureLocation(begin, target_length),
                                 type=feature.type,
                                 location_operator=feature.location_operator,
                                 strand=strand,
                                 id=feature.id,
                                 qualifiers=feature.qualifiers,
                                 sub_features=None,)
                sf2 = SeqFeature(FeatureLocation(0, end),
                                 type=feature.type,
                                 location_operator=feature.location_operator,
                                 strand=strand,
                                 id=feature.id,
                                 qualifiers=feature.qualifiers,
                                 sub_features=None,)
                nf =  SeqFeature(FeatureLocation(begin, end),
                                 type=feature.type,
                                 location_operator="join",
                                 strand=strand,
                                 id=feature.id,
                                 qualifiers=feature.qualifiers,
                                 sub_features=[sf1,sf2],)
            else:
                nf = SeqFeature(FeatureLocation(begin,end),
                     type=feature.type,
                     location_operator=feature.location_operator,
                     strand=strand,
                     id=feature.id,
                     qualifiers=feature.qualifiers,
                     sub_features=None)
            newfeatures.append(nf)
    target_sr.features.extend(newfeatures)
    return True
Exemple #14
0
def filter_tendem(gen):
    """Lazily yield the codes from gen, skipping those for which
    max_rep(code) is greater than 2."""
    for code in gen:
        if not max_rep(code) > 2:
            yield code


if __name__ == "__main__":
    # Barcode stream: barcode_gen(8) -> filter_tendem -> KeepDist(3).filter,
    # cut off after the first 96 codes.
    codes = barcode_gen(8)
    codes = filter_tendem(codes)
    acc = KeepDist(3)
    codes = islice(acc.filter(codes), 96)
    for i, code in enumerate(codes):
        core = "GTCGGA" + code + "G"
        a1 = "TA" + core
        # rc(core) is reversed, "GATC" is appended, and the whole string is
        # reversed once more to give the R linker.
        a2 = (rc(core)[::-1] + "GATC")[::-1]
        # Zero-padded to at least two digits.
        idx = f"{i:02d}"
        print(f"MseI-linker-{idx}-F\t{a1}")
        print(f"MseI-linker-{idx}-R\t{a2}")
    variants_in_gene = {}
    pb = ProgressBar(maxval=UnknownLength)
    for line in pb(open(variants)):
        chrom, pos, _, variant = line.split()
        pos = int(pos)
        for gene_chrom, low, high in gene_coords:
            if chrom == gene_chrom and low <= pos <= high:
                variant = variant.split('|')
                mel, sim = variant
                #assert chroms[chrom][pos] == 'N'
                chroms[chrom][pos] = ambigs[tuple(sorted(variant))]
                mel_copy[chrom][pos] = mel
                sim_copy[chrom][pos] = sim
                assert mel_copy[chrom][pos] != sim_copy[chrom][pos]
                if gene_strand == '-':
                    variant = (rc(variant[0]),
                                rc(variant[1]))
                variants_in_gene[pos] = variant
                break


    # Build the hunchback transcript
    gene_str = []
    mel_str = []
    sim_str = []
    gene_bps = []
    for chrom, start, stop in gene_coords:
        gene_str.append(str(chroms[chrom][start:stop]))
        mel_str.append(str(mel_copy[chrom][start:stop]))
        sim_str.append(str(sim_copy[chrom][start:stop]))
        gene_bps.extend(range(start, stop))