예제 #1
0
def translateString():
    """Translate every Chinese error reason in ``read_table`` and append
    "source->translation" lines to translate.txt.

    Relies on module-level globals: ``read_table`` (an xlrd sheet),
    ``index_reason_ch`` (source column index), and ``translate``.
    """
    # 'with' closes the file even if translate() raises, replacing the
    # original explicit try/finally around close().
    with open("translate.txt", "a+") as translate_file:
        for row_index in range(1, read_table.nrows):
            error_reason_ch = read_table.cell(row_index, index_reason_ch).value
            error_reason_en = translate(error_reason_ch)
            translate_file.write("%s->%s\n" %
                                 (error_reason_ch, error_reason_en))
예제 #2
0
    def run(self):
        """Worker entry point.

        In one-shot mode (``self.mode`` falsy) translate once and emit the
        result. Otherwise toggle a run/pause flag in settin.json and, while
        "running", repeatedly translate on a timer until paused externally
        or an error occurs.
        """

        # Load current settings; "sign" parity acts as a run/pause toggle
        # and "translateSpeed" as the polling interval (seconds).
        with open('.\\config\\settin.json') as file:
            data = json.load(file)

        if not self.mode:
            # One-shot: translate once and push the result to the UI.
            try:
                result = translate(self.window, data)
                self._signal.emit(result)
            except Exception:
                print_exc()
        else:
            # Flip the run/pause state by bumping "sign" and persist it so
            # other invocations observe the new parity.
            data["sign"] += 1
            with open('.\\config\\settin.json', 'w') as file:
                json.dump(data, file)
            try:
                if data["sign"] % 2 == 0:
                    # Even "sign" means running: show the pause icon.
                    self.window.StartButton.setIcon(
                        qtawesome.icon('fa.pause', color='white'))

                while True:

                    # Re-read settings each cycle so an external toggle of
                    # "sign" can stop this loop.
                    with open('.\\config\\settin.json') as file:
                        data = json.load(file)

                    if data["sign"] % 2 == 0:
                        try:
                            result = translate(self.window, data)
                            self._signal.emit(result)
                            # NOTE(review): the fixed 0.9s offset presumably
                            # compensates for translate() latency — confirm.
                            sec = data["translateSpeed"] - 0.9
                            time.sleep(sec)

                        except Exception:
                            print_exc()
                            break
                    else:
                        # Odd "sign" means paused: restore the play icon
                        # and leave the loop.
                        self.window.StartButton.setIcon(
                            qtawesome.icon('fa.play', color='white'))
                        break

            except Exception:
                print_exc()
예제 #3
0
def translateData(src_ch, des_en, title=None):
    """Translate the error-reason column and write results back to the sheet.

    src_ch: column index of the Chinese source text in ``read_table``.
    des_en: column index in ``write_table`` receiving the translation.
    title: optional header for the destination column; defaults to "翻译".
    """
    try:
        if title is None:
            header = "翻译"
        else:
            print("Costume title:", title)
            header = title
        write_table.write(0, des_en, header)

        for row in range(1, read_table.nrows):
            source_text = read_table.cell(row, src_ch).value
            write_table.write(row, des_en, translate(source_text))
    finally:
        # Persist whatever was written, even if translation failed midway.
        write_data.save(fileName)
        print("翻译 写入数据成功!")
def run_evaluation(model, source_vocab, target_vocabs, device, beam_size, filenames, ref_files, max_length):
    '''
        Evaluate the model on each input file against its gold references.
        model: the model being evaluated
        source_vocab: the source vocabulary
        target_vocabs: the target vocabulary for each file
        device: torch device the model runs on
        beam_size: beam size during the translating
        filenames: filenames of triples to process
        ref_files: filenames with gold-standards for each process
        max_length: max length of a sentence
        Returns a list with one exact-match accuracy per input file.
    '''

    accuracies = []
    for index, eval_name in enumerate(filenames):
        eval_ref = ref_files[index]
        # Split "dir/.../corpus" into the reference directory and the corpus
        # prefix used to select the reference files inside it.
        eval_ref, corpus = '/'.join(eval_ref.split('/')[:-1]), eval_ref.split('/')[-1]

        # Build references[j] = list of gold strings for sentence j, one
        # entry per matching reference file.
        references = []
        for fname in sorted(os.listdir(eval_ref)):
            if corpus not in fname:
                continue
            path = os.path.join(eval_ref, fname)
            with open(path) as f:
                doc = f.read().split('\n')
            # BUG FIX: the original seeded `references` only when the first
            # directory entry (index 0) matched `corpus`; if the first match
            # came later, references[j].append raised IndexError. Seeding on
            # "still empty" is robust to any file ordering.
            if not references:
                references = [[w] for w in doc]
            else:
                for ref_index, ref in enumerate(doc):
                    references[ref_index].append(ref)

        print(f'Reading {eval_name}')
        with open(eval_name, "r") as f:
            outputs = translate(model, index, f, source_vocab, target_vocabs[index], device,
                                beam_size=beam_size, max_length=max_length)

        # Exact-match accuracy, case-insensitive, ignoring the <eos> marker.
        acc = 0.0
        for j, output in enumerate(outputs):
            if output.replace("<eos>", "").strip().lower() in [w.lower() for w in references[j]]:
                acc += 1
        if outputs:  # guard against division by zero on an empty file
            acc /= len(outputs)
        accuracies.append(acc)
    return accuracies
예제 #5
0
def parse(file_name, target_name):
    """Extract the horizontal text boxes from a PDF, translate each one,
    and append the translations to ``target_name``.

    file_name: path of the PDF to read.
    target_name: path of the text file the translations are appended to.
    Raises PDFTextExtractionNotAllowed if the PDF forbids extraction.
    """
    # BUG FIX: the original opened fp and never closed it; 'with' guarantees
    # the PDF handle is released even if parsing raises.
    with open(file_name, 'rb') as fp:
        praser = PDFParser(fp)
        doc = PDFDocument()
        praser.set_document(doc)
        doc.set_parser(praser)

        doc.initialize()

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Pages are 1-indexed for the progress output.
        for page_number, page in enumerate(doc.get_pages(), start=1):
            print('page: ' + str(page_number))
            interpreter.process_page(page)
            # layout is an LTPage holding the objects parsed from this page
            # (LTTextBox, LTFigure, LTImage, LTTextBoxHorizontal, ...);
            # the text itself lives in each object's text attribute.
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    # Append mode so successive pages/boxes accumulate.
                    with open(target_name, 'a') as f:
                        results = x.get_text()
                        translate_text = translate(results)
                        f.write(translate_text + '\n')
def run_translate(model, source_vocab, target_vocabs, save_dir, device, beam_size, filenames, max_length):
    '''
        Translate each input file and save the outputs to disk.
        model: the model used for translation
        source_vocab: the source vocabulary
        target_vocabs: the target vocabulary for each file
        save_dir: path where the outputs will be saved
        device: torch device the model runs on
        beam_size: beam size during the translating
        filenames: filenames of triples to process
        max_length: max length of a sentence
    '''

    for index, eval_name in enumerate(filenames):
        name = eval_name.split("/")[-1]
        print(f'Reading {eval_name}')
        # BUG FIX: fout was opened with open()/close() around a block that
        # can raise (translate, writes), leaking the handle on error; the
        # 'with' below closes both files on every path.
        out_path = save_dir + name + "." + str(index) + ".out"
        with open(eval_name, "r") as f, open(out_path, "w") as fout:
            outputs = translate(model, index, f, source_vocab, target_vocabs[index], device,
                                beam_size=beam_size, max_length=max_length)
            for output in outputs:
                fout.write(output.replace("<eos>", "").strip() + "\n")
예제 #7
0
def single_file():
    """Slice the MT019529 SARS-CoV-2 genome into its annotated regions
    (UTRs, orf1ab, structural and accessory genes) using the coordinates
    from its NCBI GenBank record, then print the translation of orf7a.

    All start/stop offsets below are 1-based inclusive GenBank coordinates
    converted to 0-based half-open Python slices.
    """
    file = "FastaFiles/MT019529.txt"
    genome = make_genome_from_txt(file)

    sequence = genome['sequence']  # full genome, 29899 bases

    # https://www.ncbi.nlm.nih.gov/nuccore/MT019529

    # 5'UTR
    # https://www.ncbi.nlm.nih.gov/nuccore/MT019529.1?from=1&to=265
    # 1..265
    #
    _5UTR = sequence[:265]

    # Its link is: the same + ".1?location=266:13468,13468:21555"

    # Basically each section has a "gene" and a "CDS" section
    # gene: gives us the overall start/finish and gene name
    #       LINK: source, gene, translation, and genome
    # CDS: this gives us all the real information that we need. It is basically
    #      the gene section but with more info. This does have a protein id link
    #      in it that you can use.
    #      LINK: source, gene, translation, genome
    #      PROTEINIDLINK: source, gene, translation
    #
    # KEY TAKEAWAY
    # the links themselves aren't that helpful because everything in the links
    # is already shown in the main link's info.
    # We get: start, stop, gene name, product name, translation, protein id, etc

    # join(266..13468,13468..21555)
    # gene = orf1ab
    # ribosomal_slippage                                    What does this mean?
    # note= pp1ab; translated by -1 ribosomal frameshift      Correlation?
    # product = orf1ab polyprotein                          look up what that is
    # protein id = https://www.ncbi.nlm.nih.gov/protein/1805293612
    # translation = ...
    orf1ab = sequence[265:21555]  # -> gets translated to the translation
    # TODO: probably gonna wanna get the translation at some point

    # GAP OF STUFF FROM 21555 -> 21563 (13)

    # 21563..25384
    # gene = s
    # note = structural protein
    # product = surface glycoprotein                        look up what that is
    # protein id = https://www.ncbi.nlm.nih.gov/protein/1805293613
    # translation = ...
    s = sequence[21562:25384]

    # GAP 25384 -> 25393 (9)

    # 25393..26220
    # gene = orf3a
    # product = orf3a protein
    # protein id = https://www.ncbi.nlm.nih.gov/protein/1805293614
    # translation = ...
    orf3a = sequence[25392:26220]

    # Gap 26220 -> 26245 (25)

    # 26245..26472
    # gene = e
    # product = envelope protein
    # protein id = https://www.ncbi.nlm.nih.gov/protein/1805293615
    # translation = ..
    e = sequence[26244:26472]

    # Gap 26472 -> 26523 (51)

    # 26523..27191
    # gene = m
    # note = structural protein
    # product = membrane glycoprotein
    # protein id = https://www.ncbi.nlm.nih.gov/protein/1805293616
    # translation = ...
    m = sequence[26522:27191]

    # Gap 27191 -> 27202 (11)

    # 27202..27387
    # gene = orf6
    # product = orf 6 protein
    # protein id = https://www.ncbi.nlm.nih.gov/protein/1805293617
    # translation = ...
    orf6 = sequence[27201:27387]

    # Gap 27387 -> 27394 (7)

    # 27394..27759
    # gene = orf7a
    # product = orf7a protein
    # protein id = https://www.ncbi.nlm.nih.gov/protein/1805293618
    # translation = ....
    orf7a = sequence[27393:27759]

    # Gap 27759 -> 27894 (135)

    # 27894..28259
    # gene = orf8
    # product = orf8 protein
    # protein id = https://www.ncbi.nlm.nih.gov/protein/1805293619
    # translation = ...
    orf8 = sequence[27893:28259]

    # Gap 28259 -> 28274 (15)

    # 28274..29533
    # gene = N
    # note = structural protein
    # product = nucleocapsid phosphoprotein
    # protein id = https://www.ncbi.nlm.nih.gov/protein/1805293620
    # translation = ...
    n = sequence[28273:29533]

    # Gap 29533 -> 29558 (25)

    # 29558..29674
    # gene = orf10
    # product = orf10 protein
    # protein id = https://www.ncbi.nlm.nih.gov/protein/1805293621
    # translation = ...
    orf10 = sequence[29557:29674]

    # 3'UTR
    # 29675..29899
    _3_utr = sequence[29674:29899]

    print(translate(orf7a))