Example 1
def compareGTFs(truthGTF, compGTF):
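    # Parse both GTFs into Transcript objects (coverage, bounds, and exon lists) keyed by transcript_id, then hand the two sets to compareAll().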
    transcriptsTruth = dict()
    with open(truthGTF, 'r') as tsv:
        for line in tsv:
            row = line.strip().split('\t')
            if len(row) < 5:
                continue

            covIndex = row[8].find('cov')
            covStart = row[8].find('"', covIndex) + 1
            covEnd = row[8].find('"', covStart)
            cov = float(row[8][covStart:covEnd])

            transcriptIdIndex = row[8].find('transcript_id')
            transcriptIdStart = row[8].find('"', transcriptIdIndex) + 1
            transcriptIdEnd = row[8].find('"', transcriptIdStart)
            transcriptId = row[8][transcriptIdStart:transcriptIdEnd]

            if row[2] == 'transcript':
                transcriptsTruth[transcriptId] = Transcript(
                    row[0], int(row[3]), int(row[4]), cov, transcriptId)
            elif row[2] == 'exon':
                transcriptsTruth[transcriptId].exons.append(
                    (int(row[3]), int(row[4])))

    transcriptsTruth = transcriptsTruth.values()

    transcriptsComp = dict()
    with open(compGTF, 'r') as tsv:
        for line in tsv:
            row = line.strip().split('\t')
            if len(row) < 5:
                continue

            covIndex = row[8].find('cov')
            covStart = row[8].find('"', covIndex) + 1
            covEnd = row[8].find('"', covStart)
            cov = float(row[8][covStart:covEnd])

            transcriptIdIndex = row[8].find('transcript_id')
            transcriptIdStart = row[8].find('"', transcriptIdIndex) + 1
            transcriptIdEnd = row[8].find('"', transcriptIdStart)
            transcriptId = row[8][transcriptIdStart:transcriptIdEnd]

            if row[2] == 'transcript':
                transcriptsComp[transcriptId] = Transcript(
                    row[0], int(row[3]), int(row[4]), cov, transcriptId)
            elif row[2] == 'exon':
                transcriptsComp[transcriptId].exons.append(
                    (int(row[3]), int(row[4])))

    transcriptsComp = transcriptsComp.values()

    compareAll(transcriptsTruth, transcriptsComp)
Example 2
def homework_html_to_LaTeX(file_in, soln=False):
    global week_number
    the_homework = Transcript(file_in)
    the_homework_text = the_homework.text
    soup = BeautifulSoup(the_homework_text, 'html.parser')
    print('souped up!')

    if not soln:
        file_name = file_in.strip().split('/')[9][0:-8] + "LaTeXnosoln.txt"
        file_out = open(
            'C:/Users/Justin Yan/Documents/Development/Python/AoPSCleanScript/AoPSCleanScript/homework_LaTeX/'
            + file_name, 'w')
        print('File opened for writing')
    else:
        file_name = file_in.strip().split('/')[9][0:-8] + "LaTeXwithsoln.txt"
        file_out = open(
            'C:/Users/Justin Yan/Documents/Development/Python/AoPSCleanScript/AoPSCleanScript/homework_LaTeX/'
            + file_name, 'w')
        print('File opened for writing')

    week_number = file_name.split('HTML')[0]
    transcribe_preamble(soup, file_out)
    #process the problem body
    transcribe_problems(soup, file_out, soln)

    file_out.write(r'\end{document}')
Example 3
def processSAM(sam, genome):
    # This function extracts the SAM header (because we'll need that later) and creates a Transcript object for every SAM transcript.
    # Transcripts are returned in two separate lists: one canonical and one noncanonical.

    header = ""
    canTranscripts = {}
    noncanTranscripts = {} 
    #unmodifiedTranscripts = {} # Place to put transcripts that didn't map or multimapped.
    with open(sam, 'r') as f:
        for line in f:
            line = line.strip()
            if line.startswith("@"):
                header = header + line + "\n"
                continue
            t = Transcript(line, genome)
            #print Transcript.getNMandMDFlags(t, genome)
            
            # Filter out transcripts that are multimapping
            if int(t.FLAG) > 16:
                continue
                #unmodifiedTranscripts[t.QNAME] = t
            # Skip unmapped transcripts altogether
            if t.CHROM == "*":
                continue
            if t.isCanonical == True:
                canTranscripts[t.QNAME] = t
            else:
                noncanTranscripts[t.QNAME] = t
    return header, canTranscripts, noncanTranscripts #, unmodifiedTranscripts
Example 4
    def check_level(self, line):
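        # Keep whichever transcript has the numerically lower (better) annotation level.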
        if int(line.attrs['level']) < int(self.transcript.attrs['level']):
            self.transcript = Transcript(line)
            self.type = 'level'
            return True

        return False
Example 5
def parsePro(filename):
    ''' Return a dictionary with transcript id (e.g. 0300689) pointing to coverage level
    '''
    threshold = 0.00005

    transcripts = dict()
    with open(filename, 'r') as f:
        for line in f:
            row = line.strip().split('\t')
            if len(row) < 8:
                continue

            tag = row[1]
            sep1 = row[0].find(':')
            sep2 = row[0].find('-', sep1)
            sep3 = row[0].find('W', sep2)
            chrom = row[0][:sep1]
            start = int(row[0][sep1 + 1:sep2])
            end = int(row[0][sep2 + 1:sep3])
            fraction = float(row[8])
            #cov = float(row[7])
            #if cov > 0:
            #    fraction = float(row[11]) / cov

            if fraction > threshold:
                transcripts[tag] = Transcript(chrom, start, end, fraction, tag)
    return transcripts
Example 6
    def check_length(self, line):
        # Prefer the longer transcript.
        alt_transcript = Transcript(line)
        if alt_transcript.length > self.transcript.length:
            self.type = 'length'
            self.transcript = alt_transcript
            return True
        return False
Example 7
def load_assembled_transcripts(filename, ref_genome):
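    # Build Transcript objects (and their exons) from an assembled GTF, keeping only entries whose chromosome is present in ref_genome.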
    datafile = open(filename)
    data = datafile.readlines()
    list_transcripts = []
    for line in data:
        column = line.split('\t')
        if column[2] == "transcript":
            try:
                list_transcripts.append(itranscript)
            except NameError:
                # First transcript line: nothing to append yet.
                pass
            transcript_info = column[8].split(' ')
            transcript_id = transcript_info[3].strip()
            gene_id = transcript_info[1].strip()
            chromosome = column[0]
            if chromosome in ref_genome.chromosomes_dict:
                fpkm = transcript_info[7].strip(";").strip("\"")
                sign = column[6].strip()
                itranscript = Transcript(transcript_id, gene_id, chromosome, fpkm, sign)
            else:
                pass
        elif column[2] == "exon":
            if column[0] in ref_genome.chromosomes_dict:
                transcript_info = column[8].split(' ')
                exon_start = int(column[3])
                exon_end = int(column[4])
                if itranscript.id == transcript_info[3]:
                    itranscript.add_exon([exon_start, exon_end],
                                         ref_genome.chromosomes_dict[itranscript.chromosome][exon_start - 1:exon_end])
                else:
                    print('WARNING ', transcript_info, itranscript.id)
            else:
                pass
    # Append the last transcript; the loop above only appends when the next transcript line starts.
    try:
        list_transcripts.append(itranscript)
    except NameError:
        pass
    return list_transcripts
Example 8
def main():
    uiRoot = tkinter.Tk()
    uiRoot.configure(background="black")

    textView = tkinter.Text(uiRoot, font=('Tiresias', 21))
    textView.configure(background='black')
    textView.tag_config('unstable', foreground='gray')
    textView.tag_config('stable', foreground='white')

    language_code = 'nl-NL'  # a BCP-47 language tag
    model = Transcript()
    service = TranscriptionService(language_code, model)

    def close_window():
        service.stop()
        uiRoot.destroy()

    uiRoot.protocol("WM_DELETE_WINDOW", close_window)

    textView.after(50, updateUI, textView, model)
    textView.pack()

    stopButton = tkinter.Button(uiRoot)
    buttonDecoration = Switch(stopButton, service)
    stopButton.pack(fill=tkinter.X)

    uiRoot.mainloop()
Example 9
def compareGTFs(truthGTF, compGTF):
    transcriptsTruth = dict()
    with open(truthGTF, 'r') as tsv:
        for line in tsv:
            row = line.strip().split('\t')
            if len(row) < 5:
                continue

            transcriptIdIndex = row[8].find('transcript_id')
            transcriptIdStart = row[8].find('"', transcriptIdIndex) + 1
            transcriptIdEnd = row[8].find('"', transcriptIdStart)
            transcriptId = row[8][transcriptIdStart:transcriptIdEnd]

            print(row[2])

            if row[2] == 'transcript':
                print('Found transcript ' + str(transcriptId))
                transcriptsTruth[transcriptId] = Transcript(
                    row[0], int(row[3]), int(row[4]), 1, transcriptId)
            elif row[2] == 'exon' and transcriptId in transcriptsTruth:
                transcriptsTruth[transcriptId].exons.append(
                    (int(row[3]), int(row[4])))

    transcriptsTruth = transcriptsTruth.values()

    transcriptsComp = dict()
    with open(compGTF, 'r') as tsv:
        for line in tsv:
            row = line.strip().split('\t')
            if len(row) < 5:
                continue

            transcriptIdIndex = row[8].find('transcript_id')
            transcriptIdStart = row[8].find('"', transcriptIdIndex) + 1
            transcriptIdEnd = row[8].find('"', transcriptIdStart)
            transcriptId = row[8][transcriptIdStart:transcriptIdEnd]

            if row[2] == 'transcript':
                transcriptsComp[transcriptId] = Transcript(
                    row[0], int(row[3]), int(row[4]), 1, transcriptId)
            elif row[2] == 'exon' and transcriptId in transcriptsComp:
                transcriptsComp[transcriptId].exons.append(
                    (int(row[3]), int(row[4])))

    transcriptsComp = transcriptsComp.values()

    compareAll(transcriptsTruth, transcriptsComp)
Example 10
    def use_the_non_NA_transcript_supported(self, line):
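        # Prefer whichever transcript has a real (non-'NA') transcript_support_level.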
        if line.attrs['transcript_support_level'] == 'NA':
            return True

        if self.transcript.attrs['transcript_support_level'] == 'NA':
            self.transcript = Transcript(line)
            self.type = 'transcript_support_level'
            return True

        return False
Example 11
    def check_MANE_dataset(self, line):
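        # Prefer whichever transcript carries the MANE_Select tag.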
        if 'MANE_Select' in self.transcript.attrs['tags']:
            self.type = 'MANE_Select'
            return True

        elif 'MANE_Select' in line.attrs['tags']:
            self.transcript = Transcript(line)
            self.type = 'MANE_Select'
            return True

        return False
Example 12
    def check_first_transcript(self, line):
        if "CCDS" in line.attrs['tags']:  # check that it's a member of the consensus CDS gene set
            self.transcript = Transcript(line)
            if 'MANE_Select' in self.transcript.attrs['tags']:  # the transcript belongs to the MANE Select data set
                self.type = 'MANE_Select'
            else:
                self.type = 'only_transcript'
        else:
            self.type = 'one_rejected_transcript'
Example 13
    def check_support_level(self, line):
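        # A numerically lower transcript_support_level is better; switch if the new line improves on the current one.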
        line_transcript_support_level = int(
            line.attrs['transcript_support_level'])
        original_transcript_support_level = int(
            self.transcript.attrs['transcript_support_level'])

        if line_transcript_support_level < original_transcript_support_level:
            self.transcript = Transcript(line)
            self.type = 'transcript_support_level'
            return True

        return False
Example 14
    def check_CCDS(self, line):
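        # Prefer a transcript in the consensus CDS (CCDS) set over one that is not.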
        if "CCDS" in line.attrs[
                'tags'] and "CCDS" not in self.transcript.attrs['tags']:
            self.transcript = Transcript(line)
            self.type = 'CCDS'
            return True

        elif "CCDS" not in line.attrs[
                'tags'] and "CCDS" in self.transcript.attrs['tags']:
            return True

        return False
Example 15
def compareGTFs(proFile, truthGTF, compGTF):
    # file 1 is a .pro file output by flux
    transcriptsTruth = parsePro(proFile)

    with open(truthGTF, 'r') as tsv:
        for line in tsv:
            row = line.strip().split('\t')
            if len(row) < 5:
                continue

            transcriptIdIndex = row[8].find('transcript_id')
            transcriptIdStart = row[8].find('"', transcriptIdIndex) + 1
            transcriptIdEnd = row[8].find('"', transcriptIdStart)
            transcriptId = row[8][transcriptIdStart:transcriptIdEnd]

            #if row[2] == 'transcript':
            #    transcriptsTruth[transcriptId] = Transcript(row[0], int(row[3]), int(row[4]), transcriptCovs[transcriptId])

            #if row[1] == 'protein_coding' and row[2] == 'exon' and transcriptId in transcriptsTruth:
            if row[2] == 'exon' and transcriptId in transcriptsTruth:
                transcriptsTruth[transcriptId].exons.append(
                    (int(row[3]), int(row[4])))
    transcriptsTruth = transcriptsTruth.values()

    transcriptsComp = dict()
    with open(compGTF, 'r') as tsv:
        for line in tsv:
            row = line.strip().split('\t')
            if len(row) < 5:
                continue

            covIndex = row[8].find('cov')
            covStart = row[8].find('"', covIndex) + 1
            covEnd = row[8].find('"', covStart)
            cov = float(row[8][covStart:covEnd])

            transcriptIdIndex = row[8].find('transcript_id')
            transcriptIdStart = row[8].find('"', transcriptIdIndex) + 1
            transcriptIdEnd = row[8].find('"', transcriptIdStart)
            transcriptId = row[8][transcriptIdStart:transcriptIdEnd]

            if row[2] == 'transcript':
                transcriptsComp[transcriptId] = Transcript(
                    row[0], int(row[3]), int(row[4]), cov, transcriptId)
            elif row[2] == 'exon':
                transcriptsComp[transcriptId].exons.append(
                    (int(row[3]), int(row[4])))

    transcriptsComp = transcriptsComp.values()

    compareAll(transcriptsTruth, transcriptsComp)
Example 16
File: merge.py Project: neevor/grit
def build_merged_transcript(gene_id, clustered_transcripts):
    # find the transcript bounds
    start, stop = 1e20, 0
    for transcript in clustered_transcripts:
        start = min(start, transcript.exons[0][0])
        stop = max(stop, transcript.exons[-1][-1])

    # merge the promoters
    try:
        new_promoter = (min(t.promoter[0] for t in clustered_transcripts
                            if t.promoter != None),
                        max(t.promoter[1] for t in clustered_transcripts
                            if t.promoter != None))
    except ValueError:
        new_promoter = None

    # merge the polyas
    try:
        new_polya = (min(t.polya_region[0] for t in clustered_transcripts
                         if t.polya_region != None),
                     max(t.polya_region[1] for t in clustered_transcripts
                         if t.polya_region != None))
    except ValueError:
        new_polya = None

    # choose a template transcript, and make sure that all of the
    # clustered transcripts have the same internal structure (
    # this should be guaranteed by the calling function )
    bt = clustered_transcripts[0]
    assert all(t.IB_key() == bt.IB_key() for t in clustered_transcripts)
    new_exons = list(bt.exons)
    new_exons[0] = (start, new_exons[0][1])
    new_exons[-1] = (new_exons[-1][0], stop)
    # choose a random id - this should be renamed in the next step
    new_trans_id = gene_id + "_RNDM_%i" % random.randrange(int(1e9))
    new_transcript = Transcript(new_trans_id,
                                bt.chrm,
                                bt.strand,
                                new_exons,
                                bt.cds_region,
                                gene_id,
                                name=bt.name,
                                gene_name=bt.gene_name,
                                promoter=new_promoter,
                                polya_region=new_polya)

    return new_transcript
Example 17
def build_gene(elements, fasta=None, ref_genes=None):
    gene_min = min(
        min(e) for e in chain(elements.tss_exons, elements.tes_exons,
                              elements.se_transcripts))
    gene_max = max(
        max(e) for e in chain(elements.tss_exons, elements.tes_exons,
                              elements.se_transcripts))

    transcripts = []
    for i, exons in enumerate(
            build_transcripts_from_elements(elements.tss_exons,
                                            elements.internal_exons,
                                            elements.tes_exons,
                                            elements.se_transcripts,
                                            elements.introns,
                                            elements.strand)):
        transcript = Transcript("%s_%i" % (elements.id, i),
                                elements.chrm,
                                elements.strand,
                                exons,
                                cds_region=None,
                                gene_id=elements.id)
        transcript.promoter = find_matching_promoter_for_transcript(
            transcript, elements.promoter)
        transcript.polya_region = find_matching_polya_region_for_transcript(
            transcript, elements.polyas)
        transcripts.append(transcript)

    if len(transcripts) == 0:
        return None

    gene = Gene(elements.id, elements.id, elements.chrm, elements.strand,
                gene_min, gene_max, transcripts)

    if fasta != None:
        gene.transcripts = find_cds_for_gene(gene,
                                             fasta,
                                             only_longest_orf=True)

    if ref_genes != None:
        gene = rename_transcripts(gene, ref_genes)

    return gene
Example 18
def lect_to_TeX(args):
    # file to be read
    file_in = args.file_in
    file_out = args.file_out
    image_path = args.image_out
    file_name = args.file_name

    #instantiate Transcript object
    the_transcript = Transcript(file_in)
    #access string instance var containing HTML text
    transcript_text = the_transcript.text
    #BeautifulSoup object allows easier traverse of HTML text
    soup = BeautifulSoup(transcript_text, 'html.parser')

    O = open(file_out + file_name, 'w')

    transcribe_preamble(soup, O, image_path)

    transcribe_msgs(soup, O, image_path)

    O.write(r'\end{document}')
    counter = 0
Example 19
def cli():
    print(f'###########################\n'
          '#   GPA Calculator v0.3   #\n'
          '# Developed by Daanish KS #\n'
          '###########################\n')

    session = PromptSession()  # Enables file path history for convenience

    while True:
        csv_file = session.prompt('Transcript CSV file path: ',
                                  completer=file_completion(),
                                  validator=file_validation(),
                                  validate_while_typing=True)
        x = Transcript(csv_file)

        file_request = prompt('Write GPA report to file [y/n]? ',
                              validator=yes_no_validation(),
                              validate_while_typing=True)

        if file_request in {'Y', 'y', 'YES', 'Yes', 'yes'}:
            report_type = prompt('JSON [1] or YAML [2]? ',
                                 validator=report_type_validation(),
                                 validate_while_typing=True)
            if report_type == '1':
                x.gpa_report_to_file(file_path='gpa_report.json')
            if report_type == '2':
                x.gpa_report_to_file(file_path='gpa_report.yaml')
        print()
        yaml.dump(x.gpa_report(round_place=3), sys.stdout)
        print()

        repeat_request = prompt('Continue [y/n]? ',
                                validator=yes_no_validation(),
                                validate_while_typing=True)
        if repeat_request in {'N', 'n', 'NO', 'No', 'no'}:
            break
        else:
            print(f'\n-------------------------\n')
Example 20
    def __init__(self, game, x, y):
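        # Set up the sprite's images, movement state, and its Knowledge and Transcript helpers.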
        self.groups = game.all_sprites
        pg.sprite.Sprite.__init__(self, self.groups)
        self.game = game
        self.images = {'normal': pg.image.load(path.join(game.img_folder, "apple_64px.png")).convert_alpha(), \
                       'blink': pg.image.load(path.join(game.img_folder, "apple_64px_blink.png")).convert_alpha(), \
                       'wink': pg.image.load(path.join(game.img_folder, "apple_64px_wink.png")).convert_alpha()}
        self.blinks = False
        self.blink_time = .25
        self.staring_time = 3
        self.start_time = time.time()
        self.image = self.images['normal']
        self.rect = self.image.get_rect()
        self.rect.center = (x, y)
        self.hit_rect = self.rect
        self.hit_rect.center = self.rect.center
        self.vel = vec(0, 0)
        self.position = vec(x, y)
        self.dest = vec_dest(x, y)
        self.previous_pos = vec_prev(x, y)
        self.instruction = ""
        self.orientation = "front" # left, right, front, back
        self.name = "Young Apple"
        self.silence_responses = ["can you please say that again?", "oops, I missed that. say again?", "I heard *silence*", "repeat again, please?", "could you say again?", "I didn't hear that, try again?", "I heard *silence*"]
        self.knowledge = Knowledge(self)
        self.transcript = Transcript()

        # Working memory properties
        self.recognized = []
        self.actions = [] # current, complete list of action sequences e.g. [[1],[[0],[2]]]
        self.input_to_actions = []
        self.action_queue = [] # remaining actions to be completed
        self.current_action = []
        self.key_used = ""
        #self.responses = []
        self.response = ""
Example 21
def test_raises_no_mapped_segments(alignments):
    with pytest.raises(NoMappedSegmentsError):
        Transcript(alignments, DEFAULT_SKIP, DEFAULT_MAP)
Example 22
def test_NM_001456_only(flna_annotations, args):
    transcript = Transcript(*args['transcript_args'])
    assert flna_annotations.get_annotations(
        transcript, args['junction_tolerance']) == ['NM_001456']
Example 23
def test_pre_mRNA_only(flna_annotations, args):
    transcript = Transcript(*args['transcript_args'])
    assert flna_annotations.get_annotations(
        transcript, args['junction_tolerance']) == ['pre-mRNA']
Example 24
    # Initialize list of debaters
    for candidate in transcript.candidates:
        sub_debaters.append(
            Debater(transcript, candidate, avg_words, avg_mentions,
                    avg_accessory, avg_gensent, avg_polisent))

    return sub_debaters


f = open('training_input.txt', 'w')
comma = ', '
links = get_transcripts()

# Iterate over transcript links to retrieve only presidential debates
transcripts = [
    Transcript(url) for url in [
        links[12], links[13], links[14], links[36], links[37], links[38],
        links[75], links[76], links[77], links[81], links[82], links[83]
    ] + links[104:128]
]

debaters = []

# Decompose transcripts into Debater objects
for transcript in transcripts:
    debaters += decompose_transcript(transcript)

# Write calculated attributes to file for machine learning use
for debater in debaters:
    f.write(debater.name + comma + debater.date + '\n' +
            str(debater.word_count) + comma + str(debater.mention_count) +
Example 25
    def get_transcripts(self):
        for document in self.cursor:
            yield Transcript(document)
Example 26
    def __init__(self):
        """Wrap a new transcript in a shared cell for
        thread-safety."""
        transcript = Transcript()
        self.cell = SharedCell(transcript)
Example 27
def auto_vid_maker(transcript_path: str,
                   audio_path: str,
                   video_path: str,
                   fps: int = 30,
                   threads: int = 15,
                   silent: bool = False) -> str:
    """Creates a video of images synced with a transcript and audio
    recording.

    Parameters:
    transcript_path (str): Path to annotated transcript.
    audio_path (str): Path to audio recording of transcript.
    video_path (str): Location to create video.

    Returns:
    video_path (str): Path to video.
    """
    print("Parsing transcript...")
    transcript: Transcript = Transcript(transcript_path)

    image_dir: str = str(uuid.uuid4())
    os.mkdir(image_dir)
    copyfile("beginning_image.jpg",
             os.path.join(image_dir, "beginning_image.jpg"))

    print("Downloading images...")

    with concurrent.futures.ThreadPoolExecutor(max_workers=15) as executor:
        executor.map(
            lambda x: pull_image(
                x[0], image_dir, mode=x[1], threads=threads, silent=silent),
            [(keywords, topic[keywords]) for topic in transcript.topics
             for keywords in topic])

    print("Processing transcript...")

    gentle_json: Dict = process_with_gentle(transcript.cleaned_transcript,
                                            audio_path)
    timestamps: Dict[str,
                     Dict[str,
                          float]] = process_timestamps(transcript, gentle_json)
    time_frames: Dict[str, Dict[str, int]] = timestamps_to_frames(timestamps,
                                                                  audio_path,
                                                                  fps=fps)
    frames_dir: str = os.path.join(image_dir, "frames")
    os.mkdir(frames_dir)

    print("Creating frames...")

    for phrase in time_frames:
        if phrase != "beginning_image":
            topic: str = list(transcript.parsed_transcript[phrase].keys())[0]
        else:
            topic: str = phrase
        frame_path: str = draw_frame(
            os.path.join(image_dir, topic + ".jpg"), phrase, frames_dir,
            str(time_frames[phrase]["start"]) + ".jpg")

        for frame_num in range(time_frames[phrase]["start"] + 1,
                               time_frames[phrase]["end"] + 1):
            copyfile(frame_path,
                     os.path.join(frames_dir,
                                  str(frame_num) + ".jpg"))

    print("Creating video...")

    temp_vid: str = os.path.join(frames_dir, str(uuid.uuid4()) + ".mp4")
    cmd: str = f"ffmpeg -r {fps} -f image2 -s 800x600 -i {frames_dir}/%d.jpg -vcodec libx264 -crf 25  -pix_fmt yuv420p {temp_vid}"
    subprocess.call(cmd, shell=True)
    subprocess.call(f"ffmpeg -i {audio_path} -i {temp_vid} {video_path}",
                    shell=True)

    rmtree(image_dir)

    return os.path.abspath(video_path)
Example 28
    def __init__(self, address, chats):
        who = Transcript(address, chats)
        self.cell = SharedCell(who)