def compareGTFs(truthGTF, compGTF):
    """Load the transcripts of two GTF files and hand them to compareAll.

    Both files are parsed identically. Every ``transcript`` record creates a
    ``Transcript`` keyed by its ``transcript_id``; every ``exon`` record
    appends a ``(start, end)`` tuple to the ``exons`` list of the transcript
    with the same id. The coverage is the quoted value that follows the
    first occurrence of ``cov`` in the attribute column (column 9).

    Args:
        truthGTF: Path of the GTF file with the reference transcripts.
        compGTF: Path of the GTF file with the transcripts to evaluate.

    Raises:
        KeyError: If an exon record names a transcript id for which no
            transcript record has been read yet.
        ValueError: If a coverage value or coordinate is not numeric.
    """

    def quotedValue(attributes, key):
        # Text between the first pair of double quotes found at or after the
        # first occurrence of `key`.
        valueStart = attributes.find('"', attributes.find(key)) + 1
        return attributes[valueStart:attributes.find('"', valueStart)]

    def loadTranscripts(gtfPath):
        transcripts = dict()
        with open(gtfPath, 'r') as tsv:
            for line in tsv:
                row = line.strip().split('\t')
                # The attribute column is row[8]; shorter rows (comments,
                # blank or truncated lines) cannot be parsed and are skipped.
                if len(row) < 9:
                    continue

                cov = float(quotedValue(row[8], 'cov'))
                transcriptId = quotedValue(row[8], 'transcript_id')

                if row[2] == 'transcript':
                    transcripts[transcriptId] = Transcript(
                        row[0], int(row[3]), int(row[4]), cov, transcriptId)
                elif row[2] == 'exon':
                    transcripts[transcriptId].exons.append(
                        (int(row[3]), int(row[4])))
        return transcripts.values()

    transcriptsTruth = loadTranscripts(truthGTF)
    transcriptsComp = loadTranscripts(compGTF)
    compareAll(transcriptsTruth, transcriptsComp)
def homework_html_to_LaTeX(
        file_in,
        soln=False,
        output_dir='C:/Users/Justin Yan/Documents/Development/Python/AoPSCleanScript/AoPSCleanScript/homework_LaTeX/'):
    """Convert a homework HTML file into a LaTeX text file.

    The output file name is derived from the tenth '/'-separated component
    of ``file_in`` with its last eight characters removed, followed by
    ``LaTeXnosoln.txt`` or ``LaTeXwithsoln.txt``. The module-level
    ``week_number`` is set to the part of that name before ``'HTML'``.

    Args:
        file_in: Path of the homework HTML file; passed to ``Transcript``.
        soln: When true, the ``withsoln`` file name is used and the flag is
            forwarded to ``transcribe_problems``.
        output_dir: Directory prefix (including the trailing separator) the
            output file is created in. Defaults to the original hard-coded
            location.

    Raises:
        IndexError: If ``file_in`` has fewer than ten '/'-separated parts.
    """
    global week_number
    the_homework = Transcript(file_in)
    soup = BeautifulSoup(the_homework.text, 'html.parser')
    print('souped up!')

    suffix = "LaTeXwithsoln.txt" if soln else "LaTeXnosoln.txt"
    file_name = file_in.strip().split('/')[9][0:-8] + suffix

    # The context manager guarantees the file is flushed and closed even if
    # one of the transcribe steps raises.
    with open(output_dir + file_name, 'w') as file_out:
        print('File opened for writing')
        week_number = file_name.split('HTML')[0]
        transcribe_preamble(soup, file_out)
        # process the problem body
        transcribe_problems(soup, file_out, soln)
        # Raw string: '\e' is not a valid escape sequence in a normal literal.
        file_out.write(r'\end{document}')
def processSAM(sam, genome):
    """Read a SAM file into its header text and two transcript dictionaries.

    Lines starting with "@" are collected (newline-terminated) into the
    header string. Every other line becomes a ``Transcript(line, genome)``.
    Transcripts whose FLAG is greater than 16 (the author's filter for
    multimapping reads) or whose CHROM is "*" (unmapped) are dropped.

    Returns:
        A tuple ``(header, canTranscripts, noncanTranscripts)`` where both
        dictionaries map QNAME to the Transcript object, split by whether
        ``isCanonical`` equals True.
    """
    headerLines = []
    canonical = {}
    noncanonical = {}

    with open(sam, 'r') as samFile:
        for rawLine in samFile:
            record = rawLine.strip()
            if record.startswith("@"):
                headerLines.append(record + "\n")
                continue

            t = Transcript(record, genome)

            # Skip multimappers first, then reads that did not map at all.
            if int(t.FLAG) > 16 or t.CHROM == "*":
                continue

            bucket = canonical if t.isCanonical == True else noncanonical
            bucket[t.QNAME] = t

    return "".join(headerLines), canonical, noncanonical
def check_level(self, line):
    """Replace the kept transcript when ``line`` has a lower 'level'.

    Both ``attrs['level']`` values are compared as integers. On replacement
    ``self.transcript`` becomes ``Transcript(line)`` and ``self.type`` is
    set to 'level'.

    Returns:
        True if the transcript was replaced, otherwise False.
    """
    candidate_level = int(line.attrs['level'])
    current_level = int(self.transcript.attrs['level'])
    if candidate_level >= current_level:
        return False

    self.transcript = Transcript(line)
    self.type = 'level'
    return True
def parsePro(filename, threshold=0.00005):
    """Parse a .pro file into Transcript objects keyed by transcript id.

    Column 1 is expected to look like ``<chrom>:<start>-<end>W``, column 2
    holds the transcript id (e.g. 0300689) and column 9 the fraction that is
    stored as the transcript's coverage.

    Args:
        filename: Path of the tab-separated .pro file.
        threshold: Rows whose fraction is not strictly greater than this
            value are ignored.

    Returns:
        A dict mapping transcript id to
        ``Transcript(chrom, start, end, fraction, id)``.

    Raises:
        ValueError: If a coordinate or the fraction is not numeric.
    """
    transcripts = dict()
    with open(filename, 'r') as f:
        for line in f:
            row = line.strip().split('\t')
            # row[8] is read below, so at least nine columns are required.
            if len(row) < 9:
                continue

            tag = row[1]

            locus = row[0]
            sep1 = locus.find(':')
            sep2 = locus.find('-', sep1)
            sep3 = locus.find('W', sep2)
            chrom = locus[:sep1]
            start = int(locus[sep1 + 1:sep2])
            end = int(locus[sep2 + 1:sep3])

            fraction = float(row[8])
            if fraction > threshold:
                transcripts[tag] = Transcript(chrom, start, end, fraction, tag)
    return transcripts
def check_length(self, line):
    """Keep whichever transcript is longer.

    A ``Transcript`` is built from ``line``; if its ``length`` is greater
    than that of the kept transcript it replaces it and ``self.type`` is
    set to 'length'.

    Returns:
        True if the transcript was replaced, otherwise False.
    """
    candidate = Transcript(line)
    is_longer = candidate.length > self.transcript.length
    if not is_longer:
        return False

    self.transcript = candidate
    self.type = 'length'
    return True
def load_assembled_transcripts(filename, ref_genome):
    """Load assembled transcripts and their exons from a GTF-like file.

    A ``transcript`` record on a chromosome present in
    ``ref_genome.chromosomes_dict`` starts a new ``Transcript``; the
    following ``exon`` records are added to it together with the matching
    slice of the chromosome sequence. Transcripts on unknown chromosomes
    are skipped.

    Args:
        filename: Path of the tab-separated annotation file.
        ref_genome: Object exposing ``chromosomes_dict`` (chromosome name ->
            indexable sequence).

    Returns:
        A list with every transcript that was created, in file order.
    """
    list_transcripts = []
    # Transcript currently being filled; None while nothing is open.
    itranscript = None

    with open(filename) as datafile:
        for line in datafile:
            column = line.split('\t')
            # Column 9 (attributes) is required; this also skips comment
            # and blank lines instead of failing on column[2].
            if len(column) < 9:
                continue

            if column[2] == "transcript":
                # Flush the finished transcript exactly once.
                if itranscript is not None:
                    list_transcripts.append(itranscript)
                    itranscript = None

                chromosome = column[0]
                if chromosome in ref_genome.chromosomes_dict:
                    transcript_info = column[8].split(' ')
                    transcript_id = transcript_info[3].strip()
                    gene_id = transcript_info[1].strip()
                    fpkm = transcript_info[7].strip(";").strip("\"")
                    sign = column[6].strip()
                    itranscript = Transcript(transcript_id, gene_id,
                                             chromosome, fpkm, sign)
            elif column[2] == "exon":
                if itranscript is None:
                    continue
                if column[0] not in ref_genome.chromosomes_dict:
                    continue
                transcript_info = column[8].split(' ')
                exon_start = int(column[3])
                exon_end = int(column[4])
                if itranscript.id == transcript_info[3]:
                    chrom_seq = ref_genome.chromosomes_dict[
                        itranscript.chromosome]
                    itranscript.add_exon([exon_start, exon_end],
                                         chrom_seq[exon_start - 1:exon_end])
                else:
                    # Single pre-formatted argument: prints the same text
                    # under Python 2 and Python 3.
                    print('WARNING  %s %s' % (transcript_info,
                                              itranscript.id))

    # The final transcript has no following record to trigger its flush.
    if itranscript is not None:
        list_transcripts.append(itranscript)
    return list_transcripts
def main():
    """Build the caption window, start transcription and run the UI loop.

    A black Tk window hosts a Text widget with the tags 'unstable' (gray)
    and 'stable' (white) plus a button decorated by ``Switch``. ``updateUI``
    is scheduled 50 ms after start-up with the text widget and the
    transcript model. Closing the window stops the transcription service
    before the window is destroyed.
    """
    window = tkinter.Tk()
    window.configure(background="black")

    caption_view = tkinter.Text(window, font=('Tiresias', 21))
    caption_view.configure(background='black')
    for tag_name, colour in (('unstable', 'gray'), ('stable', 'white')):
        caption_view.tag_config(tag_name, foreground=colour)

    transcript_model = Transcript()
    # 'nl-NL' is a BCP-47 language tag
    transcription = TranscriptionService('nl-NL', transcript_model)

    def shut_down():
        transcription.stop()
        window.destroy()

    window.protocol("WM_DELETE_WINDOW", shut_down)
    caption_view.after(50, updateUI, caption_view, transcript_model)
    caption_view.pack()

    toggle_button = tkinter.Button(window)
    # Kept in a local so the Switch object stays referenced while the
    # main loop runs.
    toggle_decoration = Switch(toggle_button, transcription)
    toggle_button.pack(fill=tkinter.X)

    window.mainloop()
def compareGTFs(truthGTF, compGTF):
    """Load the transcripts of two GTF files and hand them to compareAll.

    Every ``transcript`` record creates a ``Transcript`` with coverage 1
    keyed by its ``transcript_id``. ``exon`` records append a
    ``(start, end)`` tuple to the transcript with the same id and are
    ignored when that id has not been seen. While reading ``truthGTF`` the
    feature type of every parsed row and each transcript id found are
    printed.

    Args:
        truthGTF: Path of the GTF file with the reference transcripts.
        compGTF: Path of the GTF file with the transcripts to evaluate.

    Raises:
        ValueError: If a coordinate is not numeric.
    """

    def loadTranscripts(gtfPath, echo):
        transcripts = dict()
        with open(gtfPath, 'r') as tsv:
            for line in tsv:
                row = line.strip().split('\t')
                # The attribute column is row[8]; shorter rows (comments,
                # blank or truncated lines) cannot be parsed and are skipped.
                if len(row) < 9:
                    continue

                attributes = row[8]
                idStart = attributes.find(
                    '"', attributes.find('transcript_id')) + 1
                transcriptId = attributes[idStart:attributes.find('"', idStart)]

                if echo:
                    print(row[2])

                if row[2] == 'transcript':
                    if echo:
                        print('Found transcript ' + str(transcriptId))
                    transcripts[transcriptId] = Transcript(
                        row[0], int(row[3]), int(row[4]), 1, transcriptId)
                elif row[2] == 'exon' and transcriptId in transcripts:
                    transcripts[transcriptId].exons.append(
                        (int(row[3]), int(row[4])))
        return transcripts.values()

    # Only the truth file was traced on stdout originally; keep that.
    transcriptsTruth = loadTranscripts(truthGTF, True)
    transcriptsComp = loadTranscripts(compGTF, False)
    compareAll(transcriptsTruth, transcriptsComp)
def use_the_non_NA_transcript_supported(self, line):
    """Prefer the transcript whose 'transcript_support_level' is not 'NA'.

    * ``line`` is 'NA': nothing changes, the decision is final (True).
    * ``line`` is not 'NA' but the kept transcript is: the kept transcript
      is replaced by ``Transcript(line)`` and ``self.type`` becomes
      'transcript_support_level' (True).
    * neither is 'NA': undecided (False).
    """
    if line.attrs['transcript_support_level'] == 'NA':
        return True

    if self.transcript.attrs['transcript_support_level'] != 'NA':
        return False

    self.transcript = Transcript(line)
    self.type = 'transcript_support_level'
    return True
def check_MANE_dataset(self, line):
    """Decide between the kept transcript and ``line`` by the MANE_Select tag.

    If the kept transcript carries 'MANE_Select' in ``attrs['tags']`` it
    stays. Otherwise, if ``line`` carries the tag, it replaces the kept
    transcript. In both cases ``self.type`` is set to 'MANE_Select'.

    Returns:
        True if either transcript carries the tag, otherwise False.
    """
    if 'MANE_Select' not in self.transcript.attrs['tags']:
        if 'MANE_Select' not in line.attrs['tags']:
            return False
        self.transcript = Transcript(line)

    self.type = 'MANE_Select'
    return True
def check_first_transcript(self, line):
    """Classify the first transcript seen.

    A line without the "CCDS" tag (consensus CDS gene set) is rejected and
    ``self.type`` becomes 'one_rejected_transcript'. Otherwise the line is
    stored as ``self.transcript`` and ``self.type`` becomes 'MANE_Select'
    when the stored transcript belongs to the MANE Select data set, else
    'only_transcript'.
    """
    if "CCDS" not in line.attrs['tags']:
        self.type = 'one_rejected_transcript'
        return

    self.transcript = Transcript(line)
    in_mane_select = 'MANE_Select' in self.transcript.attrs['tags']
    self.type = 'MANE_Select' if in_mane_select else 'only_transcript'
def check_support_level(self, line):
    """Replace the kept transcript when ``line`` has a lower support level.

    Both ``attrs['transcript_support_level']`` values are compared as
    integers. On replacement ``self.type`` is set to
    'transcript_support_level'.

    Returns:
        True if the transcript was replaced, otherwise False.
    """
    candidate_level = int(line.attrs['transcript_support_level'])
    current_level = int(self.transcript.attrs['transcript_support_level'])
    if candidate_level >= current_level:
        return False

    self.transcript = Transcript(line)
    self.type = 'transcript_support_level'
    return True
def check_CCDS(self, line):
    """Decide between the kept transcript and ``line`` by the "CCDS" tag.

    When exactly one of the two carries "CCDS" in ``attrs['tags']`` the
    decision is final: if it is ``line``, it replaces the kept transcript
    and ``self.type`` becomes 'CCDS'; if it is the kept transcript, nothing
    changes.

    Returns:
        True if exactly one of the two carries the tag, otherwise False.
    """
    line_in_ccds = "CCDS" in line.attrs['tags']
    kept_in_ccds = "CCDS" in self.transcript.attrs['tags']

    if line_in_ccds == kept_in_ccds:
        return False

    if line_in_ccds:
        self.transcript = Transcript(line)
        self.type = 'CCDS'
    return True
def compareGTFs(proFile, truthGTF, compGTF):
    """Compare flux-simulated truth transcripts against a computed GTF.

    The truth transcripts come from ``parsePro(proFile)``; their exons are
    filled in from the ``exon`` records of ``truthGTF`` whose
    ``transcript_id`` is present in that dictionary. The comparison set is
    read from ``compGTF``: ``transcript`` records create a ``Transcript``
    (coverage taken from the quoted value after ``cov``) and ``exon``
    records are appended to the transcript with the same id. Both sets are
    passed to ``compareAll``.

    Args:
        proFile: Path of the .pro file output by flux.
        truthGTF: Path of the GTF file holding the exons of the truth set.
        compGTF: Path of the GTF file with the transcripts to evaluate.

    Raises:
        KeyError: If an exon record in ``compGTF`` names a transcript id for
            which no transcript record has been read yet.
        ValueError: If a coverage value or coordinate is not numeric.
    """

    def quotedValue(attributes, key):
        # Text between the first pair of double quotes found at or after the
        # first occurrence of `key`.
        valueStart = attributes.find('"', attributes.find(key)) + 1
        return attributes[valueStart:attributes.find('"', valueStart)]

    def dataRows(gtfPath):
        # Yield the split rows that have an attribute column (row[8]);
        # comments, blank and truncated lines are skipped.
        with open(gtfPath, 'r') as tsv:
            for line in tsv:
                row = line.strip().split('\t')
                if len(row) >= 9:
                    yield row

    transcriptsTruth = parsePro(proFile)
    for row in dataRows(truthGTF):
        transcriptId = quotedValue(row[8], 'transcript_id')
        if row[2] == 'exon' and transcriptId in transcriptsTruth:
            transcriptsTruth[transcriptId].exons.append(
                (int(row[3]), int(row[4])))
    transcriptsTruth = transcriptsTruth.values()

    transcriptsComp = dict()
    for row in dataRows(compGTF):
        cov = float(quotedValue(row[8], 'cov'))
        transcriptId = quotedValue(row[8], 'transcript_id')

        if row[2] == 'transcript':
            transcriptsComp[transcriptId] = Transcript(
                row[0], int(row[3]), int(row[4]), cov, transcriptId)
        elif row[2] == 'exon':
            transcriptsComp[transcriptId].exons.append(
                (int(row[3]), int(row[4])))
    transcriptsComp = transcriptsComp.values()

    compareAll(transcriptsTruth, transcriptsComp)
def build_merged_transcript(gene_id, clustered_transcripts):
    """Merge transcripts that share their internal structure into one.

    The merged transcript uses the exons of the first clustered transcript,
    with the first exon's start moved to the smallest first-exon start in
    the cluster and the last exon's end moved to the largest last-exon end.
    Promoter and polyA regions are the union (min start, max end) of the
    regions that are set, or None when no transcript has one.

    Args:
        gene_id: Gene identifier; also the prefix of the generated id.
        clustered_transcripts: Non-empty sequence of transcripts whose
            ``IB_key()`` values are all equal (guaranteed by the caller).

    Returns:
        A new ``Transcript`` with a random placeholder id of the form
        ``<gene_id>_RNDM_<n>``, expected to be renamed in a later step.
    """

    def merged_region(regions):
        # Union of the regions that are set; None if none is.
        present = [region for region in regions if region is not None]
        if not present:
            return None
        return (min(region[0] for region in present),
                max(region[1] for region in present))

    # find the transcript bounds
    start, stop = 1e20, 0
    for transcript in clustered_transcripts:
        start = min(start, transcript.exons[0][0])
        stop = max(stop, transcript.exons[-1][-1])

    new_promoter = merged_region(t.promoter for t in clustered_transcripts)
    new_polya = merged_region(t.polya_region for t in clustered_transcripts)

    # choose a template transcript, and make sure that all of the
    # clustered transcripts have the same internal structure (this
    # should be guaranteed by the calling function)
    bt = clustered_transcripts[0]
    assert all(t.IB_key() == bt.IB_key() for t in clustered_transcripts)

    new_exons = list(bt.exons)
    new_exons[0] = (start, new_exons[0][1])
    new_exons[-1] = (new_exons[-1][0], stop)

    # choose a random id - this should be renamed in the next step.
    # randrange needs an int: a float bound raises TypeError on 3.12+.
    new_trans_id = gene_id + "_RNDM_%i" % random.randrange(10 ** 9)
    new_transcript = Transcript(new_trans_id,
                                bt.chrm,
                                bt.strand,
                                new_exons,
                                bt.cds_region,
                                gene_id,
                                name=bt.name,
                                gene_name=bt.gene_name,
                                promoter=new_promoter,
                                polya_region=new_polya)
    return new_transcript
def build_gene(elements, fasta=None, ref_genes=None):
    """Assemble a Gene from discovered transcript elements.

    One ``Transcript`` named ``<elements.id>_<i>`` is created for every
    exon chain produced by ``build_transcripts_from_elements`` and is given
    a matching promoter and polyA region. The gene bounds are the smallest
    and largest coordinate found in the TSS exons, TES exons and
    single-exon transcripts.

    Args:
        elements: Element collection exposing ``id``, ``chrm``, ``strand``,
            ``tss_exons``, ``internal_exons``, ``tes_exons``,
            ``se_transcripts``, ``introns``, ``promoter`` and ``polyas``.
        fasta: Optional sequence source; when given, the gene's transcripts
            are replaced by ``find_cds_for_gene(..., only_longest_orf=True)``.
        ref_genes: Optional reference genes; when given, the gene is passed
            through ``rename_transcripts``.

    Returns:
        The assembled ``Gene``, or None when no transcript could be built.
    """

    def boundary_elements():
        # Elements that can touch either end of the gene.
        return chain(elements.tss_exons, elements.tes_exons,
                     elements.se_transcripts)

    gene_min = min(min(e) for e in boundary_elements())
    gene_max = max(max(e) for e in boundary_elements())

    transcripts = []
    exon_chains = build_transcripts_from_elements(
        elements.tss_exons, elements.internal_exons, elements.tes_exons,
        elements.se_transcripts, elements.introns, elements.strand)
    for i, exons in enumerate(exon_chains):
        transcript = Transcript("%s_%i" % (elements.id, i),
                                elements.chrm,
                                elements.strand,
                                exons,
                                cds_region=None,
                                gene_id=elements.id)
        transcript.promoter = find_matching_promoter_for_transcript(
            transcript, elements.promoter)
        transcript.polya_region = find_matching_polya_region_for_transcript(
            transcript, elements.polyas)
        transcripts.append(transcript)

    if not transcripts:
        return None

    gene = Gene(elements.id, elements.id, elements.chrm, elements.strand,
                gene_min, gene_max, transcripts)

    if fasta is not None:
        gene.transcripts = find_cds_for_gene(gene,
                                             fasta,
                                             only_longest_orf=True)

    if ref_genes is not None:
        gene = rename_transcripts(gene, ref_genes)

    return gene
def lect_to_TeX(args):
    """Convert a lecture transcript HTML file into a LaTeX file.

    Args:
        args: Object with the attributes ``file_in`` (transcript to read),
            ``file_out`` (output path prefix), ``file_name`` (appended to
            ``file_out`` to form the output path) and ``image_out`` (image
            path handed to the transcribe helpers).
    """
    file_in = args.file_in
    file_out = args.file_out
    image_path = args.image_out
    file_name = args.file_name

    # The Transcript object exposes the HTML source as `.text`;
    # BeautifulSoup makes that HTML easier to traverse.
    the_transcript = Transcript(file_in)
    soup = BeautifulSoup(the_transcript.text, 'html.parser')

    # The context manager guarantees the output is flushed and closed even
    # if one of the transcribe steps raises.
    with open(file_out + file_name, 'w') as tex_file:
        transcribe_preamble(soup, tex_file, image_path)
        transcribe_msgs(soup, tex_file, image_path)
        tex_file.write(r'\end{document}')


# NOTE(review): in the flattened original this assignment directly followed
# the function body; it is presumably a module-level counter used by other
# code - confirm its intended scope.
counter = 0
def cli():
    """Run the interactive GPA calculator prompt loop.

    After printing the banner, each round asks for a transcript CSV path,
    optionally writes the GPA report to ``gpa_report.json`` or
    ``gpa_report.yaml``, dumps the report (rounded to 3 places) as YAML on
    stdout and asks whether to continue. The loop ends when the user
    answers no.
    """
    banner = ('###########################\n'
              '# GPA Calculator v0.3 #\n'
              '# Developed by Daanish KS #\n'
              '###########################\n')
    print(banner)

    yes_answers = {'Y', 'y', 'YES', 'Yes', 'yes'}
    no_answers = {'N', 'n', 'NO', 'No', 'no'}
    report_paths = {'1': 'gpa_report.json', '2': 'gpa_report.yaml'}

    session = PromptSession()  # Enables file path history for convenience

    while True:
        csv_file = session.prompt('Transcript CSV file path: ',
                                  completer=file_completion(),
                                  validator=file_validation(),
                                  validate_while_typing=True)
        x = Transcript(csv_file)

        file_request = prompt('Write GPA report to file [y/n]? ',
                              validator=yes_no_validation(),
                              validate_while_typing=True)
        if file_request in yes_answers:
            report_type = prompt('JSON [1] or YAML [2]? ',
                                 validator=report_type_validation(),
                                 validate_while_typing=True)
            report_path = report_paths.get(report_type)
            if report_path is not None:
                x.gpa_report_to_file(file_path=report_path)

        print()
        yaml.dump(x.gpa_report(round_place=3), sys.stdout)
        print()

        repeat_request = prompt('Continue [y/n]? ',
                                validator=yes_no_validation(),
                                validate_while_typing=True)
        if repeat_request in no_answers:
            break
        print('\n-------------------------\n')
def __init__(self, game, x, y):
    """Create the sprite at (x, y) and register it with the game's sprites.

    Loads the 'normal', 'blink' and 'wink' images from ``game.img_folder``,
    sets up the blink timing, position/velocity vectors, the canned
    responses used when nothing was heard, and the working-memory lists.
    """
    self.groups = game.all_sprites
    pg.sprite.Sprite.__init__(self, self.groups)
    self.game = game

    image_files = (
        ('normal', "apple_64px.png"),
        ('blink', "apple_64px_blink.png"),
        ('wink', "apple_64px_wink.png"),
    )
    self.images = {
        mood: pg.image.load(path.join(game.img_folder,
                                      file_name)).convert_alpha()
        for mood, file_name in image_files
    }

    # Blink animation timing
    self.blinks = False
    self.blink_time = .25
    self.staring_time = 3
    self.start_time = time.time()

    # Image and rectangles; hit_rect refers to the same Rect as rect
    self.image = self.images['normal']
    self.rect = self.image.get_rect()
    self.rect.center = (x, y)
    self.hit_rect = self.rect
    self.hit_rect.center = self.rect.center

    # Movement state
    self.vel = vec(0, 0)
    self.position = vec(x, y)
    self.dest = vec_dest(x, y)
    self.previous_pos = vec_prev(x, y)
    self.instruction = ""
    self.orientation = "front"  # left, right, front, back

    self.name = "Young Apple"
    self.silence_responses = [
        "can you please say that again?",
        "oops, I missed that. say again?",
        "I heard *silence*",
        "repeat again, please?",
        "could you say again?",
        "I didn't hear that, try again?",
        "I heard *silence*",
    ]
    self.knowledge = Knowledge(self)
    self.transcript = Transcript()

    # Working memory properties
    self.recognized = []
    # current, complete list of action sequences e.g. [[1],[[0],[2]]]
    self.actions = []
    self.input_to_actions = []
    self.action_queue = []  # remaining actions to be completed
    self.current_action = []
    self.key_used = ""
    self.response = ""
def test_raises_no_mapped_segments(alignments):
    """Building a Transcript from these alignments must fail.

    The constructor is expected to raise NoMappedSegmentsError when called
    with the ``alignments`` fixture and the default skip/map settings.
    """
    expected_error = NoMappedSegmentsError
    with pytest.raises(expected_error):
        Transcript(alignments, DEFAULT_SKIP, DEFAULT_MAP)
def test_NM_001456_only(flna_annotations, args):
    """The transcript must be annotated with NM_001456 and nothing else."""
    transcript = Transcript(*args['transcript_args'])
    found = flna_annotations.get_annotations(transcript,
                                             args['junction_tolerance'])
    assert found == ['NM_001456']
def test_pre_mRNA_only(flna_annotations, args):
    """The transcript must be annotated as pre-mRNA and nothing else."""
    transcript = Transcript(*args['transcript_args'])
    found = flna_annotations.get_annotations(transcript,
                                             args['junction_tolerance'])
    assert found == ['pre-mRNA']
# Initialize list of debaters for candidate in transcript.candidates: sub_debaters.append( Debater(transcript, candidate, avg_words, avg_mentions, avg_accessory, avg_gensent, avg_polisent)) return sub_debaters f = open('training_input.txt', 'w') comma = ', ' links = get_transcripts() # Iterate over transcript links to retrieve only presidential debates transcripts = [ Transcript(url) for url in [ links[12], links[13], links[14], links[36], links[37], links[38], links[75], links[76], links[77], links[81], links[82], links[83] ] + links[104:128] ] debaters = [] # Decompose transcripts into Debater objects for transcript in transcripts: debaters += decompose_transcript(transcript) # Write calculated attributes to file for machine learning use for debater in debaters: f.write(debater.name + comma + debater.date + '\n' + str(debater.word_count) + comma + str(debater.mention_count) +
def get_transcripts(self):
    """Lazily yield one Transcript per document read from ``self.cursor``."""
    documents = iter(self.cursor)
    for record in documents:
        yield Transcript(record)
def __init__(self):
    """Create an empty Transcript and keep it in ``self.cell``.

    The transcript is wrapped in a SharedCell, which the original author
    describes as being there for thread-safety.
    """
    self.cell = SharedCell(Transcript())
def auto_vid_maker(transcript_path: str,
                   audio_path: str,
                   video_path: str,
                   fps: int = 30,
                   threads: int = 15,
                   silent: bool = False) -> str:
    """Creates a video of images synced with a transcript and audio recording.

    A temporary working directory (random UUID name, created in the current
    directory) receives the downloaded images and the rendered frames. It is
    removed again when the function finishes, whether it succeeds or raises.

    Parameters:
        transcript_path (str): Path to annotated transcript.
        audio_path (str): Path to audio recording of transcript.
        video_path (str): Location to create video.
        fps (int): Frame rate used for the frame timing and for ffmpeg.
        threads (int): Forwarded to ``pull_image`` for every download.
        silent (bool): Forwarded to ``pull_image`` for every download.

    Returns:
        video_path (str): Absolute path to video.
    """
    print("Parsing transcript...")
    transcript: Transcript = Transcript(transcript_path)

    image_dir: str = str(uuid.uuid4())
    os.mkdir(image_dir)
    try:
        copyfile("beginning_image.jpg",
                 os.path.join(image_dir, "beginning_image.jpg"))

        print("Downloading images...")
        downloads = [(keywords, topic[keywords])
                     for topic in transcript.topics
                     for keywords in topic]
        with concurrent.futures.ThreadPoolExecutor(
                max_workers=15) as executor:
            # The map results are intentionally not consumed: a failed
            # download does not abort the run (original behaviour).
            executor.map(
                lambda job: pull_image(job[0],
                                       image_dir,
                                       mode=job[1],
                                       threads=threads,
                                       silent=silent),
                downloads)

        print("Processing transcript...")
        gentle_json: Dict = process_with_gentle(transcript.cleaned_transcript,
                                                audio_path)
        timestamps: Dict[str, Dict[str, float]] = process_timestamps(
            transcript, gentle_json)
        time_frames: Dict[str, Dict[str, int]] = timestamps_to_frames(
            timestamps, audio_path, fps=fps)

        frames_dir: str = os.path.join(image_dir, "frames")
        os.mkdir(frames_dir)

        print("Creating frames...")
        for phrase in time_frames:
            if phrase != "beginning_image":
                topic = list(transcript.parsed_transcript[phrase].keys())[0]
            else:
                topic = phrase
            first_frame = time_frames[phrase]["start"]
            last_frame = time_frames[phrase]["end"]
            frame_path: str = draw_frame(
                os.path.join(image_dir, topic + ".jpg"), phrase, frames_dir,
                str(first_frame) + ".jpg")
            # Every further frame of the phrase is a copy of the first one.
            for frame_num in range(first_frame + 1, last_frame + 1):
                copyfile(frame_path,
                         os.path.join(frames_dir, str(frame_num) + ".jpg"))

        print("Creating video...")
        temp_vid: str = os.path.join(frames_dir, str(uuid.uuid4()) + ".mp4")
        # Argument lists (no shell) keep paths containing spaces or shell
        # metacharacters intact and cannot be used to inject commands.
        subprocess.call([
            "ffmpeg", "-r", str(fps), "-f", "image2", "-s", "800x600",
            "-i", f"{frames_dir}/%d.jpg", "-vcodec", "libx264",
            "-crf", "25", "-pix_fmt", "yuv420p", temp_vid
        ])
        subprocess.call(
            ["ffmpeg", "-i", audio_path, "-i", temp_vid, video_path])
    finally:
        rmtree(image_dir)

    return os.path.abspath(video_path)
def __init__(self, address, chats):
    """Build a Transcript from ``address`` and ``chats`` and keep it in
    ``self.cell``, wrapped in a SharedCell."""
    self.cell = SharedCell(Transcript(address, chats))