def translateString():
    # Translate each reason cell from the source sheet and append
    # "source->translation" pairs to translate.txt.
    translateFile = open("translate.txt", "a+")
    try:
        for row_index in range(1, read_table.nrows):
            error_reason_ch = read_table.cell(row_index, index_reason_ch).value
            error_reason_en = translate(error_reason_ch)
            translateFile.write("%s->%s\n" % (error_reason_ch, error_reason_en))
    finally:
        translateFile.close()
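# translateString() depends on module-level state defined elsewhere. A minimal
# sketch of what that setup could look like, assuming xlrd for the workbook;
# the file name, column index, and translate() stub below are hypothetical.
import xlrd

fileName = "errors.xls"                      # hypothetical source workbook
read_data = xlrd.open_workbook(fileName)
read_table = read_data.sheet_by_index(0)
index_reason_ch = 2                          # hypothetical column holding the Chinese reason

def translate(text):
    # Stand-in for the real translation backend (e.g. an HTTP API client).
    return text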
def run(self):
    with open('.\\config\\settin.json') as file:
        data = json.load(file)
    if not self.mode:
        # One-shot mode: translate once and emit the result.
        try:
            result = translate(self.window, data)
            self._signal.emit(result)
        except Exception:
            print_exc()
    else:
        # Toggle mode: flip the sign counter, then poll the config file and
        # keep translating while the counter stays even (i.e. running).
        data["sign"] += 1
        with open('.\\config\\settin.json', 'w') as file:
            json.dump(data, file)
        try:
            if data["sign"] % 2 == 0:
                self.window.StartButton.setIcon(
                    qtawesome.icon('fa.pause', color='white'))
                while True:
                    with open('.\\config\\settin.json') as file:
                        data = json.load(file)
                    if data["sign"] % 2 == 0:
                        try:
                            result = translate(self.window, data)
                            self._signal.emit(result)
                            sec = data["translateSpeed"] - 0.9
                            time.sleep(sec)
                        except Exception:
                            print_exc()
                            break
                    else:
                        # Counter went odd: another press asked us to stop.
                        self.window.StartButton.setIcon(
                            qtawesome.icon('fa.play', color='white'))
                        break
        except Exception:
            print_exc()
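# A sketch of the QThread subclass this run() plausibly belongs to, assuming
# PyQt5; the class name and constructor are guesses, but the attributes mirror
# what the method uses (self.mode, self.window.StartButton, self._signal).
import json
import time
from traceback import print_exc

import qtawesome
from PyQt5.QtCore import QThread, pyqtSignal

class TranslateThread(QThread):              # hypothetical name
    _signal = pyqtSignal(str)                # assumed result payload type

    def __init__(self, window, mode=False):
        super().__init__()
        self.window = window                 # main window exposing StartButton
        self.mode = mode                     # falsy: translate once; truthy: toggle the loop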
def translateData(src_ch, des_en, title=None):
    """Translate the error-reason column and write the results back to the sheet."""
    try:
        if title is not None:
            print("Custom title:", title)
            write_table.write(0, des_en, title)
        else:
            write_table.write(0, des_en, "Translation")
        for row_index in range(1, read_table.nrows):
            error_reason_ch = read_table.cell(row_index, src_ch).value
            error_reason_en = translate(error_reason_ch)
            write_table.write(row_index, des_en, error_reason_en)
    finally:
        write_data.save(fileName)
        print("Translation written to the workbook successfully!")
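# The write-side globals translateData() uses (write_table, write_data,
# fileName) are not shown. A sketch of the common xlrd + xlutils.copy pattern
# that would produce them; the file name and column indices are hypothetical.
import xlrd
from xlutils.copy import copy

fileName = "errors.xls"                      # hypothetical workbook
read_data = xlrd.open_workbook(fileName)
read_table = read_data.sheet_by_index(0)
write_data = copy(read_data)                 # writable xlwt copy of the workbook
write_table = write_data.get_sheet(0)

translateData(src_ch=2, des_en=3)            # hypothetical column indices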
def run_evaluation(model, source_vocab, target_vocabs, device, beam_size,
                   filenames, ref_files, max_length):
    '''
    Evaluates the model on each input file against its gold-standard references
    and returns the per-file accuracies
    model: the model being evaluated
    source_vocab: the source vocabulary
    target_vocabs: the target vocabulary for each file
    device: device to run the model on
    beam_size: beam size during the translating
    filenames: filenames of triples to process
    ref_files: filenames with gold-standards for each file
    max_length: max length of a sentence
    '''
    accuracies = []
    for index, eval_name in enumerate(filenames):
        eval_ref = ref_files[index]
        eval_ref, corpus = '/'.join(eval_ref.split('/')[:-1]), eval_ref.split('/')[-1]
        # Collect one list of references per example, merging every file in the
        # reference directory whose name contains the corpus name.
        references = []
        for fname in sorted(os.listdir(eval_ref)):
            if corpus in fname:
                path = os.path.join(eval_ref, fname)
                with open(path) as f:
                    doc = f.read().split('\n')
                if not references:
                    references = [[w] for w in doc]
                else:
                    for j, ref in enumerate(doc):
                        references[j].append(ref)
        print(f'Reading {eval_name}')
        with open(eval_name, "r") as f:
            outputs = translate(model, index, f, source_vocab, target_vocabs[index],
                                device, beam_size=beam_size, max_length=max_length)
        # Exact-match accuracy: an output counts as correct if it matches any
        # of its references, case-insensitively and ignoring the <eos> marker.
        acc = 0.0
        for j, output in enumerate(outputs):
            if output.replace("<eos>", "").strip().lower() in [w.lower() for w in references[j]]:
                acc += 1
        acc /= len(outputs)
        accuracies.append(acc)
    return accuracies
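# A hypothetical invocation. Each entry of ref_files is "<dir>/<corpus>", and
# every file in <dir> whose name contains <corpus> holds one reference per
# line, aligned across files; the paths and hyperparameters here are made up.
accuracies = run_evaluation(model, source_vocab, target_vocabs, device,
                            beam_size=5,
                            filenames=['data/dev.snt'],
                            ref_files=['data/references/dev'],
                            max_length=180)
print('dev accuracy:', accuracies[0])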
def parse(file_name, target_name):
    fp = open(file_name, 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize()
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed
    else:
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        page_number = 1
        for page in doc.get_pages():
            print('page: ' + str(page_number))
            interpreter.process_page(page)
            layout = device.get_result()
            # Here layout is an LTPage object holding the objects parsed from
            # this page, typically LTTextBox, LTFigure, LTImage,
            # LTTextBoxHorizontal, and so on. To get the text, read the
            # object's text attribute.
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    with open(target_name, 'a') as f:
                        results = x.get_text()
                        translate_text = translate(results)
                        f.write(translate_text + '\n')
                # if isinstance(x, LTImage):
                #     with open('patternColoring.txt', 'a') as f:
                #         results = x.get_image()
                #         f.write('###########\n' + results + '\n')
            page_number += 1
    fp.close()
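# The imports parse() relies on, for reference. The function targets the
# legacy pdfminer/pdfminer3k API (a standalone PDFDocument() wired up via
# set_parser/initialize); the module paths below follow that old layout and
# may differ in current pdfminer.six, where the import locations changed.
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, PDFTextExtractionNotAllowed
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBoxHorizontal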
def run_translate(model, source_vocab, target_vocabs, save_dir, device,
                  beam_size, filenames, max_length):
    '''
    Translates each input file and saves the outputs under save_dir
    model: the model being evaluated
    source_vocab: the source vocabulary
    target_vocabs: the target vocabulary for each file
    save_dir: path where the outputs will be saved
    device: device to run the model on
    beam_size: beam size during the translating
    filenames: filenames of triples to process
    max_length: max length of a sentence
    '''
    for index, eval_name in enumerate(filenames):
        name = eval_name.split("/")[-1]
        print(f'Reading {eval_name}')
        with open(eval_name, "r") as f:
            outputs = translate(model, index, f, source_vocab, target_vocabs[index],
                                device, beam_size=beam_size, max_length=max_length)
        # Write one stripped prediction per line, dropping the <eos> marker.
        with open(save_dir + name + "." + str(index) + ".out", "w") as fout:
            for output in outputs:
                fout.write(output.replace("<eos>", "").strip() + "\n")
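# A hypothetical call. Note that save_dir must end with a path separator,
# since the function concatenates it directly with the file name; this call
# would write the predictions to outputs/test.snt.0.out, one per line.
run_translate(model, source_vocab, target_vocabs, save_dir='outputs/',
              device=device, beam_size=5, filenames=['data/test.snt'],
              max_length=180)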
def single_file():
    file = "FastaFiles/MT019529.txt"
    genome = make_genome_from_txt(file)
    sequence = genome['sequence']  # 29899 bases
    # https://www.ncbi.nlm.nih.gov/nuccore/MT019529

    # 5'UTR
    # https://www.ncbi.nlm.nih.gov/nuccore/MT019529.1?from=1&to=265
    # 1..265
    # _5UTR = sequence[:265]

    # Its link is: the same + ".1?location=266:13468,13468:21555"
    # Basically each section has a "gene" and a "CDS" section
    # gene: gives us the overall start, finish, and gene name
    #   LINK: source, gene, translation, and genome
    # CDS: this gives us all the real information that we need. It is basically
    #   the gene section but with more info. This does have a protein id link
    #   in it that you can use.
    #   LINK: source, gene, translation, genome
    #   PROTEINIDLINK: source, gene, translation
    #
    # KEY TAKEAWAY
    # The links themselves aren't that helpful because everything in the links
    # is already shown in the main link's info.
    # We get: start, stop, gene name, product name, translation, protein id, etc.

    # join(266..13468,13468..21555)
    # gene = orf1ab
    # ribosomal_slippage: what does this mean?
    # note = pp1ab; translated by -1 ribosomal frameshift (correlation?)
    # product = orf1ab polyprotein (look up what that is)
    # protein id = https://www.ncbi.nlm.nih.gov/protein/1805293612
    # translation = ...
    orf1ab = sequence[265:21555]  # -> gets translated to the translation
    # TODO: probably gonna want to get the translation at some point

    # GAP OF STUFF FROM 21555 -> 21563 (8)

    # 21563..25384
    # gene = s
    # note = structural protein
    # product = surface glycoprotein (what is that?)
    # protein id = https://www.ncbi.nlm.nih.gov/protein/1805293613
    # translation = ...
    s = sequence[21562:25384]

    # Gap 25384 -> 25393 (9)

    # 25393..26220
    # gene = orf3a
    # product = orf3a protein
    # protein id = https://www.ncbi.nlm.nih.gov/protein/1805293614
    # translation = ...
    orf3a = sequence[25392:26220]

    # Gap 26220 -> 26245 (25)

    # 26245..26472
    # gene = e
    # product = envelope protein
    # protein id = https://www.ncbi.nlm.nih.gov/protein/1805293615
    # translation = ...
    e = sequence[26244:26472]

    # Gap 26472 -> 26523 (51)

    # 26523..27191
    # gene = m
    # note = structural protein
    # product = membrane glycoprotein
    # protein id = https://www.ncbi.nlm.nih.gov/protein/1805293616
    # translation = ...
    m = sequence[26522:27191]

    # Gap 27191 -> 27202 (11)

    # 27202..27387
    # gene = orf6
    # product = orf6 protein
    # protein id = https://www.ncbi.nlm.nih.gov/protein/1805293617
    # translation = ...
    orf6 = sequence[27201:27387]

    # Gap 27387 -> 27394 (7)

    # 27394..27759
    # gene = orf7a
    # product = orf7a protein
    # protein id = https://www.ncbi.nlm.nih.gov/protein/1805293618
    # translation = ...
    orf7a = sequence[27393:27759]

    # Gap 27759 -> 27894 (135)

    # 27894..28259
    # gene = orf8
    # product = orf8 protein
    # protein id = https://www.ncbi.nlm.nih.gov/protein/1805293619
    # translation = ...
    orf8 = sequence[27893:28259]

    # Gap 28259 -> 28274 (15)

    # 28274..29533
    # gene = N
    # note = structural protein
    # product = nucleocapsid phosphoprotein
    # protein id = https://www.ncbi.nlm.nih.gov/protein/1805293620
    # translation = ...
    n = sequence[28273:29533]

    # Gap 29533 -> 29558 (25)

    # 29558..29674
    # gene = orf10
    # product = orf10 protein
    # protein id = https://www.ncbi.nlm.nih.gov/protein/1805293621
    # translation = ...
    orf10 = sequence[29557:29674]

    # 3'UTR
    # 29675..29899
    _3_utr = sequence[29674:29899]

    print(translate(orf7a))
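# Here translate() means codon-to-protein translation, not natural language.
# A minimal sketch using Biopython's Seq.translate(), which is real Biopython
# API; wiring it in as this project's translate() is an assumption.
from Bio.Seq import Seq

def translate(nucleotides):
    # Translate an ORF string of A/C/G/T into its amino-acid sequence,
    # stopping at the first stop codon.
    return str(Seq(nucleotides).translate(to_stop=True))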