コード例 #1
0
def upstream_and_downstream_seq(args):
    """Write the 1 kb flanks of a genomic region to two FASTA files.

    Attributes read from ``args``:
        coords: region string handed to ``split_coords``; elements 0, 1 and 2
            of its result are used as chromosome, start and end.
        genome: reference passed to ``pysam.faidx``.
        directory: prefix to which ``"tmp/"`` is appended to form the output
            directory (created if it does not exist).

    Files written inside that directory:
        downstream.fa: the interval ``start - 1000`` .. ``start``, uppercased.
        upstream.fa: the interval ``end`` .. ``end + 1000``, uppercased and
            reverse-complemented.

    Returns:
        None.
    """
    flank = 1000

    region = split_coords(args.coords)
    chromosome = region[0]
    start = str(region[1])
    downstream = str(int(start) - flank)
    # Only the end coordinate is parsed from the quote-stripped string; this
    # mirrors the original behaviour.
    # NOTE(review): confirm whether the quotes should be stripped for the
    # chromosome and start as well.
    end = str(split_coords(args.coords.replace('"', ""))[2])
    upstream = str(int(end) + flank)

    # Using the samtools faidx function to take the appropriate sequence from
    # a reference genome.
    downstream_fa = Seq(
        pysam.faidx(args.genome, chromosome + ":" + downstream + "-" + start),
        generic_dna)
    upstream_fa = Seq(
        pysam.faidx(args.genome, chromosome + ":" + end + "-" + upstream),
        generic_dna)

    # Drop as many leading characters as the first line is long, plus the very
    # last character, then uppercase.
    # NOTE(review): line breaks inside the remaining text are kept as-is.
    downstream_header_len = len(downstream_fa.split('\n')[0])
    downstream_seq = downstream_fa[downstream_header_len:-1].upper()
    # Same trimming for the other flank, followed by the reverse complement.
    upstream_header_len = len(upstream_fa.split('\n')[0])
    reverse_compliment_upstream_seq = (
        upstream_fa[upstream_header_len:-1].upper().reverse_complement())

    # Making sequence records with ID header and sequence
    downstream_seq = SeqRecord(downstream_seq, id="downstream_sequence")
    reverse_compliment_upstream_seq = SeqRecord(
        reverse_compliment_upstream_seq, id="upstream_sequence")

    tmp_dir = args.directory + "tmp/"
    if not os.path.isdir(tmp_dir):
        os.mkdir(tmp_dir)

    # Writing sequences to fasta files; the context managers guarantee the
    # data is flushed and the handles are closed.
    with open(os.path.join(tmp_dir, "downstream.fa"), "w") as downstream_outfile:
        downstream_outfile.write(
            ">" + str(downstream_seq.id) + "\n" + str(downstream_seq.seq))

    with open(os.path.join(tmp_dir, "upstream.fa"), "w") as upstream_outfile:
        upstream_outfile.write(
            ">" + str(reverse_compliment_upstream_seq.id) + "\n" +
            str(reverse_compliment_upstream_seq.seq))
コード例 #2
0
def exonic_circRNA(exon_list, args):
    """Concatenate the sequences of the given exons into one string.

    Each entry of ``exon_list`` is fetched with ``pysam.faidx`` from the
    reference ``args.G``.  The fetched text is trimmed by the length of its
    first line at the front and by one character at the end, then uppercased.
    When ``args.s`` is ``"-"`` the piece is reverse-complemented; when it is
    ``"+"`` it is used as-is; any other value contributes nothing.

    Returns:
        The joined pieces as a ``str`` with every newline removed.
    """
    pieces = []
    for exon in exon_list:
        fetched = Seq(pysam.faidx(args.G, exon), generic_dna)
        strand = args.s
        if strand not in ("-", "+"):
            # Unknown strand: skip this exon entirely.
            continue
        header_len = len(fetched.split('\n')[0])
        body = fetched[header_len:-1].upper()
        if strand == "-":
            body = body.reverse_complement()
        pieces.append(str(body))

    # Making a one line sequence
    return "".join(pieces).replace("\n", "")
コード例 #3
0
def find_frame(s):
    """Return the reading frame (0, 1 or 2) that holds the longest ORF.

    Gap characters (``-``) are removed from ``s`` first.  The sequence is
    then translated from offsets 0, 1 and 2, each translation is split on the
    stop symbol ``*``, and the longest resulting piece is taken as that
    frame's ORF length.

    Args:
        s: nucleotide sequence as a string; may contain ``-`` gap characters.

    Returns:
        The 0-based offset of the frame with the longest ORF.  On a tie the
        lowest offset wins and a warning is printed.
    """
    s = s.replace("-", "")

    longest_per_frame = []
    for offset in range(3):
        frame = s[offset:]
        # Trim the trailing partial codon explicitly: the translation is
        # unchanged, but Biopython no longer has to warn about it.
        usable = len(frame) - len(frame) % 3
        protein = Seq(frame[:usable], generic_dna).translate()
        # Longest stretch between stop codons in this frame.
        longest_per_frame.append(max(len(x) for x in protein.split("*")))

    L_max = max(longest_per_frame)  # longest ORF among all frames
    frames_max = [i for i, x in enumerate(longest_per_frame) if x == L_max]
    if len(frames_max) > 1:
        # Parenthesised single-argument form prints the same text on
        # Python 2 and Python 3.
        print("Warning: more than one reading frame had max length ORF")
    return frames_max[0]
コード例 #4
0
def upstream_and_downstream_seq(
        coords,
        out_path,
        genome="/Users/lachlan/Documents/Masters_Research_2019/Data/ICM/genome.fa",
        flank=500):
    """Write the flanking sequences of a region to two FASTA files.

    Args:
        coords: region string of the form ``chromosome:start-end``.
        out_path: existing directory that receives the output files.
        genome: reference passed to ``pysam.faidx``.  Defaults to the path
            that used to be hard-coded here.
        flank: number of bases taken on each side of the region.

    Files written inside ``out_path``:
        downstream.fa: the interval ``start - flank`` .. ``start``, uppercased.
        upstream.fa: the interval ``end`` .. ``end + flank``, uppercased and
            reverse-complemented.

    Returns:
        None.
    """
    # Parse the coordinate string once instead of re-splitting per field.
    fields = coords.split(":")
    chromosome = fields[0]
    bounds = fields[1].split("-")
    start_pos = int(bounds[0])
    end_pos = int(bounds[1])

    start = str(start_pos)
    downstream = str(start_pos - flank)
    end = str(end_pos)
    upstream = str(end_pos + flank)

    # Using the samtools faidx function to take the appropriate sequence from
    # a reference genome.
    downstream_fa = Seq(
        pysam.faidx(genome, chromosome + ":" + downstream + "-" + start),
        generic_dna)
    upstream_fa = Seq(
        pysam.faidx(genome, chromosome + ":" + end + "-" + upstream),
        generic_dna)

    # Drop as many leading characters as the first line is long, plus the very
    # last character, then uppercase.
    # NOTE(review): line breaks inside the remaining text are kept as-is.
    downstream_header_len = len(downstream_fa.split('\n')[0])
    downstream_seq = downstream_fa[downstream_header_len:-1].upper()
    # Same trimming for the other flank, followed by the reverse complement.
    upstream_header_len = len(upstream_fa.split('\n')[0])
    reverse_compliment_upstream_seq = (
        upstream_fa[upstream_header_len:-1].upper().reverse_complement())

    # Making sequence records with ID header and sequence
    downstream_seq = SeqRecord(downstream_seq, id="downstream_sequence")
    reverse_compliment_upstream_seq = SeqRecord(
        reverse_compliment_upstream_seq, id="upstream_sequence")

    # Writing sequences to fasta files; the context managers guarantee the
    # data is flushed and the handles are closed.
    with open(os.path.join(out_path, "downstream.fa"), "w") as downstream_outfile:
        downstream_outfile.write(
            ">" + str(downstream_seq.id) + "\n" + str(downstream_seq.seq))

    with open(os.path.join(out_path, "upstream.fa"), "w") as upstream_outfile:
        upstream_outfile.write(
            ">" + str(reverse_compliment_upstream_seq.id) + "\n" +
            str(reverse_compliment_upstream_seq.seq))
コード例 #5
0
def randomprotein(percentage, length=10000):
    """Return the longest stop-free stretch in a random translated sequence.

    A random DNA string of ``length`` bases is generated in which G and C
    each have probability ``percentage / 200`` and A and T share the
    remainder equally.  The string is translated and split on the stop
    symbol ``*``.

    Args:
        percentage: target GC content in percent (anything ``float()``
            accepts).
        length: number of bases to generate; defaults to the previously
            hard-coded 10000.

    Returns:
        Length (int) of the longest piece between stop codons.  The previous
        implementation reset its running maximum on every iteration and so
        returned the length of the *last* piece instead.
    """
    gcpercent = float(percentage) / 100
    atpercent = 1 - gcpercent

    bases = []
    for _ in range(length):
        # One uniform draw per base keeps the random stream identical to the
        # original for a given seed.
        rand = random.uniform(0, 1)
        if rand < gcpercent / 2:
            bases.append("G")
        elif rand < gcpercent:
            bases.append("C")
        elif rand < 1 - (atpercent / 2):
            bases.append("A")
        else:
            bases.append("T")
    dna = "".join(bases)

    # Trim the trailing partial codon explicitly; the translation is the same
    # but Biopython no longer has to warn about it.
    usable = len(dna) - len(dna) % 3
    protein = Seq(dna[:usable]).translate()

    return max(len(fragment) for fragment in protein.split("*"))
コード例 #6
0
# Read the DNA sequence; the context manager closes the handle after reading.
with open("data\\dna_chromosome_1.seq", "r") as dnaSeq:
    my_seq = dnaSeq.read()
print(my_seq)
RNA = Seq(my_seq).transcribe()

# Reverse the transcribed sequence and save it to a file.
rev = RNA[::-1]
print(rev)
with open("reverse_RNAsequence_1.seq", "w") as reverseRNASeq:
    reverseRNASeq.write(str(rev))

# Task 6: report every 1-based position at which t occurs in s.
with open("sampleData.txt", "r") as mySeq:
    my_seq = mySeq.read()
# Strip first so a trailing newline does not yield a third, empty field.
s, t = my_seq.strip().splitlines()
print("s: " + s)
print("t: " + t)

# The "+ 1" includes the final window, so a match that ends exactly at the
# end of s is reported too.
for n in range(len(s) - len(t) + 1):
    if t == s[n:(len(t) + n)]:
        print(n + 1)

# Task 7
with open("seqSampleData.txt", "r") as mySeq:
    my_seq = mySeq.read()

# RNA transcribe
RNA = str(Seq(my_seq).transcribe())

startCodon = "AUG"
コード例 #7
0
ファイル: parse-blast-hd.py プロジェクト: fmarletaz/Hox
    # NOTE(review): this is the interior of an enclosing function or loop whose
    # header is not visible here; elt, rec, sens, sstart, send, qstart, qend,
    # qlen, prefix, database, scaf, prot and nucl all come from that context.

    # Fetch the sequence named by elt[0] and remember its length.
    rawseq=retr_db(database, elt[0])
    clen=len(rawseq)
    # Dump the full sequence in FASTA layout, header "<elt[0]>_<rec[2]>".
    scaf.write('>'+elt[0]+'_'+rec[2]+'\n'+rawseq+'\n')
    # Widen the subject interval by 3 bases for every query position left
    # uncovered on each side (presumably 3 nucleotides per amino acid -- TODO
    # confirm).  An extension is applied only if the new start stays above 1
    # or the new end stays below clen; otherwise that coordinate is kept.
    if sens=='+':
        start=sstart-3*(qstart-1) if sstart-3*(qstart-1)>1 else sstart
        end=send+3*(qlen-qend) if send+3*(qlen-qend)<clen else send  
    elif sens=='-':
        # Opposite orientation: the uncovered query tail extends the start
        # and the uncovered query head extends the end.
        start=sstart-3*(qlen-qend) if sstart-3*(qlen-qend)>1 else sstart
        end=send+3*(qstart-1) if send+3*(qstart-1)<clen else send  
    
    # Progress / debug trace (Python 2 print statement).
    print elt[0],elt[2],sens,sstart,send,qstart,qend,start,end,clen
    
    # start/end are used as 1-based inclusive coordinates, hence start-1.
    hdseq=Seq(rawseq[start-1:end],IUPAC.unambiguous_dna)
    if sens=='-':
        hdseq=hdseq.reverse_complement()
    # Keep only the part before the first 'N'.
    hdseq=hdseq.split('N')[0]
    print hdseq.translate()
    # Earlier header format, left disabled:
    #head='>'+prefix+'_'+elt[2].split('_')[1]+'-'+elt[0].split('|')[1]+'@'+'-'.join(map(str,elt[1]))+'\n'    
    head='>'+prefix+'_'+elt[2].split('_')[1]+'|ND'+elt[0].split('_')[1]+'@'+'-'.join(map(str,elt[1]))+'\n'
    # Write the translated and the nucleotide version under the same header.
    prot.write(head+hdseq.translate().tostring()+'\n')
    nucl.write(head+hdseq.tostring()+'\n')

#+'-'.join(map(str,elt[1])







コード例 #8
0
    except:
        d = None

# Group record ids by sequence string so identical sequences share one entry.
orf_sequence = defaultdict(list)
for orf in orfs:
    orf_sequence[str(orf.seq)].append(orf.id)

export = []
mapping = {}

# One output record per distinct sequence, numbered from 1 in dict order.
for count, (residues, ids) in enumerate(orf_sequence.items(), 1):
    group_id = 'orf_sequence_{}'.format(count)
    # Every original id points at the id of its merged record.
    mapping.update((member, group_id) for member in ids)
    export.append(
        SeqRecord(
            id=group_id,
            description=';'.join(ids),
            # Remove every '*' character from the sequence text.
            seq=Seq(residues.replace('*', '')),
        )
    )

SeqIO.write(export, path + '/nr_translated_pg_orfs.fasta', 'fasta')

# Persist the id mapping as JSON next to the FASTA file.
outpath = path + '/id_mapping.json'
mapping = json.dumps(mapping)
with open(outpath, 'w') as f:
    f.write(mapping)