Example #1
0
def fetch_sentences():
    """Fetch a page of sentences for a corpus, mark pending ones as processing,
    and attach a machine translation to each before responding.

    Query params: basename, pagesize, status, pageno.
    Returns the standard CustomResponse payload with the page and total count.
    """
    basename = request.args.get('basename')
    (sentencesobj, totalcount) = Sentence.limit(request.args.get('pagesize'), basename, request.args.get('status'),
                                                request.args.get('pageno'))
    corpus_obj = Corpus.objects(basename=basename)
    corpus_dict = json.loads(corpus_obj.to_json())
    sentences_list = []
    if sentencesobj is not None:
        sent_dicts = []
        sources = []
        for sent in sentencesobj:
            sent_dict = json.loads(sent.to_json())
            # Mark pending sentences as picked up so other fetchers skip them.
            if sent_dict['status'] == STATUS_PENDING:
                Sentence.objects(_id=sent_dict['_id']['$oid']).update(set__status=STATUS_PROCESSING)
            sent_dicts.append(sent_dict)
            sources.append(sent_dict['source'])
        # Default to English unless the corpus specifies a target language.
        target_lang = 'en'
        if 'target_lang' in corpus_dict[0] and corpus_dict[0]['target_lang'] is not None:
            target_lang = LANGUAGES[corpus_dict[0]['target_lang']]
        translation_list = translatesinglesentence(sources, target_lang)
        # Pair each already-parsed sentence dict with its translation instead of
        # re-iterating the queryset with a manual index counter.
        for sent_dict, translation in zip(sent_dicts, translation_list):
            sent_dict['translation'] = translation
            sentences_list.append(sent_dict)
    res = CustomResponse(Status.SUCCESS.value, sentences_list, totalcount)
    return res.getres()
Example #2
0
def remove_junk():
    """Delete every Sentence whose source matches a line of the uploaded file.

    The uploaded file is saved under a timestamp basename, read back line by
    line, and each line is used as an exact-match delete filter.
    """
    basename = str(int(time.time()))
    f = request.files['file']
    filepath_eng = os.path.join(
        app.config['UPLOAD_FOLDER'], basename + '_junk.txt')
    f.save(filepath_eng)
    # Reuse the path we just saved to (the original rebuilt it by string
    # concatenation) and let the context manager close the handle — the
    # original leaked the file descriptor.
    with open(filepath_eng, 'r') as junk_file:
        for line in junk_file:
            # NOTE(review): lines keep their trailing newline; this assumes the
            # stored Sentence.source values do too — confirm.
            Sentence.objects(source=line).delete()
    res = CustomResponse(Status.SUCCESS.value, None)
    return res.getres()
Example #3
0
 def __init__(self, model, title, lang):
     """Wrap *model*, keeping only sentences with at least one word."""
     self.model = model
     wrapped = (Sentence(raw) for raw in model.sentences)
     self.sentences = [s for s in wrapped if len(s.words_list) > 0]
     self.title = title
     self.lang = lang
Example #4
0
def batchsentences():
    """Split an oversized corpus into batches of 10000 sentences, creating a
    new Corpus record per batch and re-pointing each sentence's basename at it.

    Responds with the original basename. NOTE(review): the response uses
    Status.FAILURE even on the happy path — confirm that is intentional.
    """
    basename = request.args.get('basename')
    current_time = datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
    sentences = Sentence.objects(basename=basename)
    index = 2  # batch naming starts at "Batch 2"
    batch_size = 10000
    if len(sentences) > batch_size:
        for i in range(2, 1 + math.ceil(len(sentences) / batch_size)):
            if i * batch_size > len(sentences):
                # NOTE(review): (i - 1) * batch_size - len(sentences) is negative
                # here, so this slice trims the tail instead of selecting it —
                # confirm the intent was sentences[(i - 1) * batch_size:].
                sentence_batch = sentences[0:(i - 1) * batch_size - len(sentences)]
            else:
                # NOTE(review): every full batch slices from index 0; verify the
                # queryset re-evaluates after the basename updates below,
                # otherwise the same leading sentences are re-batched each pass.
                sentence_batch = sentences[0:batch_size]
            _move_batch_to_new_corpus(sentence_batch, index, current_time)
            index += 1
    res = CustomResponse(Status.FAILURE.value, basename)
    return res.getres()


def _move_batch_to_new_corpus(sentence_batch, batch_no, current_time):
    """Create a Corpus for *sentence_batch* (if non-empty) and move each
    sentence under the corpus's freshly generated basename."""
    print(len(sentence_batch))
    if len(sentence_batch) > 0:
        base = str(uuid.uuid4())
        corpus = Corpus(source_lang='English', target_lang='Hindi', status=STATUS_PROCESSED,
                        name='SC Judgment 2019 Batch ' + str(batch_no), domain='LAW', created_on=current_time,
                        last_modified=current_time, author='', comment='',
                        no_of_sentences=len(sentence_batch),
                        basename=base)
        corpus.save()
        for sentence in sentence_batch:
            sentence_dict = json.loads(sentence.to_json())
            sen = Sentence.objects(_id=sentence_dict['_id']['$oid'])
            print(sen.to_json())
            sen.update(set__basename=base)
Example #5
0
def process_files_law(basename, name):
    """Run the law-domain alignment pipeline for an uploaded Hindi/English pair.

    Filters both files, translates the Hindi side with Google, aligns the pair
    with bleualign, scores each aligned line's OCR quality, persists the
    resulting Sentence pairs under *name*, and deletes the work files.
    """
    upload_dir = app.config['UPLOAD_FOLDER']
    filtertext(upload_dir + '/' + basename + '_hin.txt',
               upload_dir + '/' + basename + '_hin_filtered.txt')
    filtertext(upload_dir + '/' + basename + '_eng.txt',
               upload_dir + '/' + basename + '_eng_filtered.txt')
    processhindi(upload_dir + '/' + basename + '_hin_filtered.txt')
    processenglish(upload_dir + '/' + basename + '_eng_filtered.txt')
    translatewithgoogle(upload_dir + '/' + basename + '_hin_filtered.txt',
                        upload_dir + '/' + basename + '_eng_tran.txt')
    # NOTE(review): shell command built by string concatenation; safe only while
    # basename is generated internally (callers use timestamps) — switch to
    # subprocess.run with an argument list if basename can ever be user input.
    os.system(
        './helpers/bleualign.py -s ' + os.getcwd() + '/upload/' + basename + '_hin_filtered' + '.txt' + ' -t ' + os.getcwd() + '/upload/' + basename +
        '_eng_filtered' + '.txt' + ' --srctotarget ' + os.getcwd() + '/upload/' + basename + '_eng_tran' + '.txt' + ' -o ' + os.getcwd() + '/upload/' + basename + '_output')
    english_res, english_points, english_points_words = _read_aligned_output(
        upload_dir + '/' + basename + '_output-t', basename)
    hindi_res, hindi_points, hindi_points_words = _read_aligned_output(
        upload_dir + '/' + basename + '_output-s', basename)
    data = {'hindi': hindi_res, 'english': english_res,
            'english_scores': english_points, 'hindi_scores': hindi_points}
    sentences = []
    for i in range(0, len(hindi_res)):
        # bleualign target lines carry "<text>:::::<alignment score>".
        sentence = Sentence(status=STATUS_PENDING, alignment_accuracy=english_res[i].split(':::::')[1], basename=name,
                            source=hindi_res[i], target=english_res[i].split(':::::')[0],
                            source_ocr_words=hindi_points_words[i], source_ocr=str(hindi_points[i]),
                            target_ocr_words=english_points_words[i], target_ocr=str(english_points[i]))
        sentences.append(sentence)
    Sentence.objects.insert(sentences)
    # Remove every work file generated for this basename.
    for f in glob.glob(upload_dir + '/' + basename + '*'):
        os.remove(f)
    res = CustomResponse(Status.SUCCESS.value, data)
    return res.getres()


def _read_aligned_output(path, basename):
    """Read one bleualign output file and score each line.

    Returns (lines, average OCR scores, per-word OCR scores), parallel lists.
    """
    lines = []
    avg_scores = []
    word_scores = []
    # Context manager guarantees the handle closes (original leaked on error).
    with open(path, 'r') as fh:
        for line in fh:
            lines.append(line)
            point = fetchwordsfromsentence(line, basename)
            avg_scores.append(point['avg'])
            word_scores.append(point['values'])
    return lines, avg_scores, word_scores
Example #6
0
def update_sentences_status():
    """Bulk-update the status of each sentence listed in the JSON body.

    Body shape: {"sentences": [{"_id": {"$oid": ...}, "status": ...}, ...]}.
    Responds with a missing-parameters error when the list is absent or not a
    list.
    """
    body = request.get_json()
    # .get avoids a KeyError (and an unhandled 500) when 'sentences' is absent.
    sentences = body.get('sentences') if body else None
    if sentences is None or not isinstance(sentences, list):
        res = CustomResponse(
            Status.ERR_GLOBAL_MISSING_PARAMETERS.value, None)
        return res.getres(), Status.ERR_GLOBAL_MISSING_PARAMETERS.value['http']['status']
    for sentence in sentences:
        Sentence.objects(_id=sentence['_id']['$oid']).update(set__status=sentence['status'])
    res = CustomResponse(Status.SUCCESS.value, None)
    return res.getres()
Example #7
0
def build_sentences(text, sequences):
    """Locate each of *sequences* inside *text* and wrap it in a Sentence
    carrying its (start, end) character offsets.

    Sequences whose first or last three words cannot be found in *text* are
    silently skipped.
    """
    sentences = []
    for seq in sequences:
        try:
            words = seq.split()
            # Anchor the start on the first three words of the sequence.
            start = text.index(" ".join(words[:3]))
            # Anchor the end just past the last three words.
            last_words = " ".join(words[len(words) - 3:])
            end = text.index(last_words) + len(last_words)
            sentences.append(Sentence(seq, start, end))
        except ValueError:
            # str.index raises ValueError when an anchor is missing; skip the
            # sequence instead of aborting (was a bare except that hid bugs).
            pass
    return sentences
Example #8
0
def update_sentences():
    """Apply user edits to sentences: log each original/edited pair to
    Sentencelog for auditing, then persist the edit and mark it EDITED.

    Body shape: {"sentences": [{"_id": {"$oid": ...}, "source": ...,
    "target": ...}, ...]}.
    """
    body = request.get_json()
    # .get avoids a KeyError (and an unhandled 500) when 'sentences' is absent.
    sentences = body.get('sentences') if body else None
    if sentences is None or not isinstance(sentences, list):
        res = CustomResponse(
            Status.ERR_GLOBAL_MISSING_PARAMETERS.value, None)
        return res.getres(), Status.ERR_GLOBAL_MISSING_PARAMETERS.value['http']['status']
    for sentence in sentences:
        corpus = Sentence.objects(_id=sentence['_id']['$oid'])
        corpus_dict = json.loads(corpus.to_json())
        # Audit-trail entry keeps both the original and the edited text.
        sentence_log = Sentencelog(source_words=corpus_dict[0]['source'].split(" "),
                                   target_words=corpus_dict[0]['target'].split(" "),
                                   source_edited_words=sentence['source'].split(" "),
                                   updated_on=datetime.now(), edited_by=request.headers.get('ad-userid'),
                                   parent_id=sentence['_id']['$oid'], target_edited_words=sentence['target'].split(" "),
                                   basename=corpus_dict[0]['basename'], source=corpus_dict[0]['source'],
                                   target=corpus_dict[0]['target'], source_edited=sentence['source'],
                                   target_edited=sentence['target'])
        sentence_log.save()
        corpus.update(set__source=sentence['source'], set__target=sentence['target'], set__status=STATUS_EDITED)
    res = CustomResponse(Status.SUCCESS.value, None)
    return res.getres()
Example #9
0
def upload_indian_kannon_file():
    """Upload an English file, machine-translate it with Anuvada, and store the
    aligned English/Hindi pairs as pending Sentences under a new Corpus.

    Required multipart form fields: name, domain, source_lang, target_lang,
    model_id; optional: comment; file field: 'english'.
    """
    basename = str(int(time.time()))

    def _blank(values):
        # A form-field list is unusable when missing, empty, or its first
        # entry is the empty string.
        return values is None or len(values) == 0 or len(values[0]) == 0

    try:
        name = request.form.getlist('name')
        domain = request.form.getlist('domain')
        source_lang = request.form.getlist('source_lang')
        target_lang = request.form.getlist('target_lang')
        model_id = request.form.getlist('model_id')
        comment = request.form.getlist('comment')
        if comment is None or len(comment) == 0:
            comment = ['']
        if (_blank(target_lang) or _blank(source_lang) or _blank(name) or _blank(domain)
                or request.files is None or request.files['english'] is None):
            res = CustomResponse(
                Status.ERR_GLOBAL_MISSING_PARAMETERS.value, None)
            return res.getres(), Status.ERR_GLOBAL_MISSING_PARAMETERS.value['http']['status']

        current_time = datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
        corpus = Corpus(source_lang=source_lang[0], target_lang=target_lang[0], status=STATUS_PROCESSING,
                        name=name[0], domain=domain[0], created_on=current_time,
                        last_modified=current_time, author='', comment=comment[0], no_of_sentences=0,
                        basename=basename)
        corpus.save()
        f_eng = request.files['english']
        filepath_eng = os.path.join(
            app.config['UPLOAD_FOLDER'], basename + '_eng_filtered.txt')
        f_eng.save(filepath_eng)

        # Produce the Hindi side with the Anuvada model selected by the caller.
        translatewithanuvadaeng(app.config['UPLOAD_FOLDER'] +
                    '/' + basename + '_eng_filtered.txt', app.config['UPLOAD_FOLDER'] +
                    '/' + basename + '_hin_filtered.txt', model_id[0])

        english_res = []
        hindi_res = []
        # Context managers close the handles even on error (original leaked).
        with open(app.config['UPLOAD_FOLDER'] + '/' + basename + '_eng_filtered.txt', 'r') as eng_file:
            for line in eng_file:
                english_res.append(line)
        with open(app.config['UPLOAD_FOLDER'] + '/' + basename + '_hin_filtered.txt', 'r') as hin_file:
            for line in hin_file:
                hindi_res.append(line)
        data = {'hindi': hindi_res, 'english': english_res}
        sentences = []
        for i in range(0, len(hindi_res)):
            sentence = Sentence(sentenceid=str(uuid.uuid4()), status=STATUS_PENDING, basename=str(
                basename), source=english_res[i], target=hindi_res[i])
            sentences.append(sentence)
        Sentence.objects.insert(sentences)
        # Remove every work file generated for this basename.
        for f in glob.glob(app.config['UPLOAD_FOLDER'] + '/' + basename + '*'):
            os.remove(f)
        res = CustomResponse(Status.SUCCESS.value, data)
        corpus = Corpus.objects(basename=basename)
        corpus.update(set__status=STATUS_PROCESSED,
                      set__no_of_sentences=len(hindi_res))
        return res.getres()
    except Exception as e:
        print(e)
        res = CustomResponse(Status.ERR_GLOBAL_SYSTEM.value, None)
        return res.getres(), Status.ERR_GLOBAL_SYSTEM.value['http']['status']
Example #10
0
def upload_benchmark_file():
    """Upload a benchmark source file and store one pending Sentence per
    non-empty line.

    Lines that fail UTF-8 decoding, or sentences that fail to save, are
    collected into an error message; on any error the endpoint responds 500
    with the details, otherwise it returns the accepted lines and marks the
    Benchmark processed.
    """
    basename = str(int(time.time()))
    # (Removed a dead `assign_to` fragment: it assigned '' and then evaluated
    # the bare name without effect; the value was never used.)
    try:
        name = request.form.getlist('name')
        source_lang = request.form.getlist('source_lang')
        if source_lang is None or len(
                source_lang) == 0 or len(source_lang[0]) == 0 or name is None or len(name) == 0 or len(
            name[0]) == 0 or request.files is None or \
                request.files['file'] is None:
            res = CustomResponse(
                Status.ERR_GLOBAL_MISSING_PARAMETERS.value, None)
            return res.getres(), Status.ERR_GLOBAL_MISSING_PARAMETERS.value['http']['status']

        current_time = datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
        corpus = Benchmark(source_lang=source_lang[0], status=STATUS_PROCESSING,
                           name=name[0], created_on=current_time, assigned_to=request.headers.get('ad-userid'),
                           last_modified=current_time, author='', no_of_sentences=0,
                           basename=basename)
        corpus.save()
        f_upload = request.files['file']
        filepath_eng = os.path.join(
            app.config['UPLOAD_FOLDER'], basename + '_eng_filtered.txt')
        f_upload.save(filepath_eng)

        english_res = []
        error = False
        error_messages = 'Error came for Sentences'
        with open(app.config['UPLOAD_FOLDER'] + '/' + basename + '_eng_filtered.txt', 'rb') as f:
            index = 1
            for raw_line in f.readlines():
                try:
                    decoded = raw_line.decode("utf8")
                    # Skip pure-newline and empty lines; index only advances
                    # for counted lines (preserves the original numbering).
                    if decoded != '\n' and len(decoded) > 0:
                        index = index + 1
                        english_res.append(decoded)
                except Exception:
                    # Record the line counter for undecodable input.
                    error = True
                    error_messages = error_messages + ' ' + str(index)
                    index = index + 1
        data = {'english': english_res}
        for i in range(0, len(english_res)):
            sentence = Sentence(sentenceid=str(uuid.uuid4()), status=STATUS_PENDING, basename=str(
                basename), source=english_res[i])
            try:
                sentence.save()
            except Exception:
                error = True
                error_messages = error_messages + ' ' + english_res[i]
        # Remove every work file generated for this basename.
        for f in glob.glob(app.config['UPLOAD_FOLDER'] + '/' + basename + '*'):
            os.remove(f)
        log.info(error)
        if error:
            # Copy the status template before annotating it: the original
            # mutated Status.ERR_GLOBAL_SYSTEM.value in place, leaking 'why'
            # into every later use of that shared dict.
            res = dict(Status.ERR_GLOBAL_SYSTEM.value)
            res['why'] = error_messages
            return jsonify(res), 500
        res = CustomResponse(Status.SUCCESS.value, data)
        corpus = Benchmark.objects(basename=basename)
        corpus.update(set__status=STATUS_PROCESSED,
                      set__no_of_sentences=len(english_res))
        return res.getres()
    except Exception as e:
        print(e)
        res = CustomResponse(Status.ERR_GLOBAL_SYSTEM.value, None)
        return res.getres(), Status.ERR_GLOBAL_SYSTEM.value['http']['status']
Example #11
0
def fetch_sentences():
    """Return every sentence stored under the requested basename."""
    basename = request.args.get('basename')
    raw = Sentence.objects(basename=basename).to_json()
    return CustomResponse(Status.SUCCESS.value, json.loads(raw)).getres()
 def parse_sentence(self, text):
     """Run *text* through the pipeline and wrap the result in a Sentence."""
     processed = self.pipeline(text)
     return Sentence(processed)