def fetch_sentences():
    global LANGUAGES
    basename = request.args.get('basename')
    totalcount = 0
    (sentencesobj, totalcount) = Sentence.limit(
        request.args.get('pagesize'), basename,
        request.args.get('status'), request.args.get('pageno'))
    corpus_obj = Corpus.objects(basename=basename)
    corpus_dict = json.loads(corpus_obj.to_json())
    sentences_list = []
    sources = []
    if sentencesobj is not None:
        # Mark pending sentences as in-progress and collect their source text.
        for sent in sentencesobj:
            sent_dict = json.loads(sent.to_json())
            sentence_qs = Sentence.objects(_id=sent_dict['_id']['$oid'])
            if sent_dict['status'] == STATUS_PENDING:
                sentence_qs.update(set__status=STATUS_PROCESSING)
            sources.append(sent_dict['source'])
        # Default to English unless the corpus specifies a target language.
        target_lang = 'en'
        if 'target_lang' in corpus_dict[0] and corpus_dict[0]['target_lang'] is not None:
            target_lang = LANGUAGES[corpus_dict[0]['target_lang']]
        translation_list = translatesinglesentence(sources, target_lang)
        # Attach each translation to its sentence, in order.
        for index, sent in enumerate(sentencesobj):
            sent_dict = json.loads(sent.to_json())
            sent_dict['translation'] = translation_list[index]
            sentences_list.append(sent_dict)
    res = CustomResponse(Status.SUCCESS.value, sentences_list, totalcount)
    return res.getres()

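# A minimal client-side sketch (not part of the original source) of calling this
# handler. The host and '/sentences' route are assumptions; only the query
# parameter names come from the handler above.
#
#   import requests
#
#   resp = requests.get('http://localhost:5000/sentences', params={
#       'basename': '1589459954',   # hypothetical corpus basename
#       'pagesize': 25,
#       'pageno': 1,
#       'status': 'PENDING',
#   })
#   for sent in resp.json()['data']:  # assumes CustomResponse exposes the payload under 'data'
#       print(sent['source'], '->', sent['translation'])
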
def remove_junk():
    basename = str(int(time.time()))
    f = request.files['file']
    filepath = os.path.join(app.config['UPLOAD_FOLDER'], basename + '_junk.txt')
    f.save(filepath)
    # Delete every stored sentence whose source matches a line of the uploaded file.
    with open(filepath, 'r') as f_junk:
        for line in f_junk:
            Sentence.objects(source=line).delete()
    res = CustomResponse(Status.SUCCESS.value, None)
    return res.getres()

def __init__(self, model, title, lang):
    self.model = model
    # Keep only sentences that contain at least one word.
    self.sentences = list(
        filter(lambda x: len(x.words_list) > 0,
               (Sentence(sentence) for sentence in model.sentences)))
    self.title = title
    self.lang = lang

def batchsentences():
    basename = request.args.get('basename')
    current_time = datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
    sentences = Sentence.objects(basename=basename)
    index = 2
    batch_size = 10000
    if len(sentences) > batch_size:
        # Batch 1 keeps the original basename; re-home the remaining sentences
        # into new corpora of at most batch_size each.
        for i in range(2, 1 + math.ceil(len(sentences) / batch_size)):
            base = str(uuid.uuid4())
            start = (i - 1) * batch_size
            end = min(i * batch_size, len(sentences))
            sentence_batch = sentences[start:end]
            if len(sentence_batch) > 0:
                corpus = Corpus(source_lang='English', target_lang='Hindi',
                                status=STATUS_PROCESSED,
                                name='SC Judgment 2019 Batch ' + str(index),
                                domain='LAW', created_on=current_time,
                                last_modified=current_time, author='', comment='',
                                no_of_sentences=len(sentence_batch), basename=base)
                corpus.save()
                # Point each sentence in this batch at the new corpus.
                for sentence in sentence_batch:
                    sentence_dict = json.loads(sentence.to_json())
                    sen = Sentence.objects(_id=sentence_dict['_id']['$oid'])
                    sen.update(set__basename=base)
            index += 1
    res = CustomResponse(Status.SUCCESS.value, basename)
    return res.getres()

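# A quick standalone illustration (not part of the original code) of the batch
# slicing above: batch 1 stays under the original basename, and min() lets the
# same slice expression handle the final partial batch.
#
#   import math
#
#   total, batch_size = 25000, 10000
#   for i in range(2, 1 + math.ceil(total / batch_size)):
#       start = (i - 1) * batch_size
#       end = min(i * batch_size, total)
#       print('batch', i, 'covers sentences', start, 'to', end)
#   # batch 2 covers sentences 10000 to 20000
#   # batch 3 covers sentences 20000 to 25000
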
def process_files_law(basename, name):
    upload = app.config['UPLOAD_FOLDER']
    # Clean both sides of the parallel corpus, then preprocess each language.
    filtertext(upload + '/' + basename + '_hin.txt',
               upload + '/' + basename + '_hin_filtered.txt')
    filtertext(upload + '/' + basename + '_eng.txt',
               upload + '/' + basename + '_eng_filtered.txt')
    processhindi(upload + '/' + basename + '_hin_filtered.txt')
    processenglish(upload + '/' + basename + '_eng_filtered.txt')
    # Machine-translate the Hindi side so Bleualign can score candidate pairs.
    translatewithgoogle(upload + '/' + basename + '_hin_filtered.txt',
                        upload + '/' + basename + '_eng_tran.txt')
    os.system('./helpers/bleualign.py -s ' + os.getcwd() + '/upload/' + basename + '_hin_filtered.txt' +
              ' -t ' + os.getcwd() + '/upload/' + basename + '_eng_filtered.txt' +
              ' --srctotarget ' + os.getcwd() + '/upload/' + basename + '_eng_tran.txt' +
              ' -o ' + os.getcwd() + '/upload/' + basename + '_output')
    english_res = []
    hindi_res = []
    english_points = []
    english_points_words = []
    hindi_points = []
    hindi_points_words = []
    # Bleualign writes aligned target lines to *_output-t and aligned source
    # lines to *_output-s; score each line's OCR quality as we read it.
    with open(upload + '/' + basename + '_output-t', 'r') as f_eng:
        for f in f_eng:
            english_res.append(f)
            point = fetchwordsfromsentence(f, basename)
            english_points.append(point['avg'])
            english_points_words.append(point['values'])
    with open(upload + '/' + basename + '_output-s', 'r') as f_hin:
        for f in f_hin:
            hindi_res.append(f)
            point = fetchwordsfromsentence(f, basename)
            hindi_points.append(point['avg'])
            hindi_points_words.append(point['values'])
    data = {'hindi': hindi_res, 'english': english_res,
            'english_scores': english_points, 'hindi_scores': hindi_points}
    sentences = []
    for i in range(0, len(hindi_res)):
        # Each aligned English line carries 'target:::::accuracy'; split it apart.
        sentence = Sentence(status=STATUS_PENDING,
                            alignment_accuracy=english_res[i].split(':::::')[1],
                            basename=name,
                            source=hindi_res[i],
                            target=english_res[i].split(':::::')[0],
                            source_ocr_words=hindi_points_words[i],
                            source_ocr=str(hindi_points[i]),
                            target_ocr_words=english_points_words[i],
                            target_ocr=str(english_points[i]))
        sentences.append(sentence)
    Sentence.objects.insert(sentences)
    # Remove the intermediate files for this upload.
    for f in glob.glob(upload + '/' + basename + '*'):
        os.remove(f)
    res = CustomResponse(Status.SUCCESS.value, data)
    return res.getres()

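# A hedged alternative sketch (not in the original source): the same Bleualign
# invocation via subprocess.run with an argument list, which avoids building a
# shell string by concatenation. The flags mirror the os.system call above.
#
#   import os
#   import subprocess
#
#   def run_bleualign(basename):
#       upload = os.path.join(os.getcwd(), 'upload')
#       subprocess.run([
#           './helpers/bleualign.py',
#           '-s', os.path.join(upload, basename + '_hin_filtered.txt'),
#           '-t', os.path.join(upload, basename + '_eng_filtered.txt'),
#           '--srctotarget', os.path.join(upload, basename + '_eng_tran.txt'),
#           '-o', os.path.join(upload, basename + '_output'),
#       ], check=True)  # raises CalledProcessError on a non-zero exit
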
def update_sentences_status():
    body = request.get_json()
    # Guard against a missing or malformed 'sentences' payload.
    if body.get('sentences') is None or not isinstance(body['sentences'], list):
        res = CustomResponse(Status.ERR_GLOBAL_MISSING_PARAMETERS.value, None)
        return res.getres(), Status.ERR_GLOBAL_MISSING_PARAMETERS.value['http']['status']
    for sentence in body['sentences']:
        sentence_qs = Sentence.objects(_id=sentence['_id']['$oid'])
        sentence_qs.update(set__status=sentence['status'])
    res = CustomResponse(Status.SUCCESS.value, None)
    return res.getres()

def build_sentences(text, sequences):
    sentences = []
    for seq in sequences:
        try:
            words = seq.split()
            # Locate the sequence in the full text by its first three words,
            # then find where its last three words end.
            index = text.index(" ".join(words[:3]))
            last_words = " ".join(words[len(words) - 3:])
            index_end = text.index(last_words) + len(last_words)
            sentences.append(Sentence(seq, index, index_end))
        except ValueError:
            # Skip sequences whose words cannot be found in the text.
            pass
    return sentences

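# A small usage sketch (not from the original source). Sentence here is a stub
# standing in for the module's own (text, start, end) class.
#
#   from collections import namedtuple
#   Sentence = namedtuple('Sentence', ['text', 'start', 'end'])
#
#   text = "The court convened. The judgment was reserved for later."
#   spans = build_sentences(text, ["The judgment was reserved for later."])
#   print(spans[0].start, spans[0].end)  # 20 56: the span of the second sentence
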
def update_sentences():
    body = request.get_json()
    if body.get('sentences') is None or not isinstance(body['sentences'], list):
        res = CustomResponse(Status.ERR_GLOBAL_MISSING_PARAMETERS.value, None)
        return res.getres(), Status.ERR_GLOBAL_MISSING_PARAMETERS.value['http']['status']
    for sentence in body['sentences']:
        sentence_qs = Sentence.objects(_id=sentence['_id']['$oid'])
        sentence_dict = json.loads(sentence_qs.to_json())
        # Record the original and edited text in the audit log before applying the edit.
        sentence_log = Sentencelog(
            source_words=sentence_dict[0]['source'].split(" "),
            target_words=sentence_dict[0]['target'].split(" "),
            source_edited_words=sentence['source'].split(" "),
            updated_on=datetime.now(),
            edited_by=request.headers.get('ad-userid'),
            parent_id=sentence['_id']['$oid'],
            target_edited_words=sentence['target'].split(" "),
            basename=sentence_dict[0]['basename'],
            source=sentence_dict[0]['source'],
            target=sentence_dict[0]['target'],
            source_edited=sentence['source'],
            target_edited=sentence['target'])
        sentence_log.save()
        sentence_qs.update(set__source=sentence['source'],
                           set__target=sentence['target'],
                           set__status=STATUS_EDITED)
    res = CustomResponse(Status.SUCCESS.value, None)
    return res.getres()

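# The request body this handler expects, reconstructed from the field accesses
# above (shape inferred from the code, not documented in the source):
#
#   {
#     "sentences": [
#       {
#         "_id": {"$oid": "5ecb7a4e..."},   # hypothetical ObjectId
#         "source": "edited source text",
#         "target": "edited target text"
#       }
#     ]
#   }
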
def upload_indian_kannon_file():
    basename = str(int(time.time()))
    try:
        name = request.form.getlist('name')
        domain = request.form.getlist('domain')
        source_lang = request.form.getlist('source_lang')
        target_lang = request.form.getlist('target_lang')
        model_id = request.form.getlist('model_id')
        comment = request.form.getlist('comment')
        if comment is None or len(comment) == 0:
            comment = ['']
        # Reject the request unless every required form field and the file are present.
        if target_lang is None or len(target_lang) == 0 or len(target_lang[0]) == 0 \
                or source_lang is None or len(source_lang) == 0 or len(source_lang[0]) == 0 \
                or name is None or len(name) == 0 or len(name[0]) == 0 \
                or domain is None or len(domain) == 0 or len(domain[0]) == 0 \
                or request.files is None or request.files['english'] is None:
            res = CustomResponse(Status.ERR_GLOBAL_MISSING_PARAMETERS.value, None)
            return res.getres(), Status.ERR_GLOBAL_MISSING_PARAMETERS.value['http']['status']
        else:
            current_time = datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
            corpus = Corpus(source_lang=source_lang[0], target_lang=target_lang[0],
                            status=STATUS_PROCESSING, name=name[0], domain=domain[0],
                            created_on=current_time, last_modified=current_time,
                            author='', comment=comment[0], no_of_sentences=0,
                            basename=basename)
            corpus.save()
            f_eng = request.files['english']
            filepath_eng = os.path.join(app.config['UPLOAD_FOLDER'],
                                        basename + '_eng_filtered.txt')
            f_eng.save(filepath_eng)
            # Generate the Hindi side by translating the English file with the
            # selected model. (Earlier variants translated with Google and
            # re-aligned with Bleualign; see process_files_law.)
            translatewithanuvadaeng(app.config['UPLOAD_FOLDER'] + '/' + basename + '_eng_filtered.txt',
                                    app.config['UPLOAD_FOLDER'] + '/' + basename + '_hin_filtered.txt',
                                    model_id[0])
            english_res = []
            hindi_res = []
            with open(app.config['UPLOAD_FOLDER'] + '/' + basename + '_eng_filtered.txt', 'r') as f_eng:
                for f in f_eng:
                    english_res.append(f)
            with open(app.config['UPLOAD_FOLDER'] + '/' + basename + '_hin_filtered.txt', 'r') as f_hin:
                for f in f_hin:
                    hindi_res.append(f)
            data = {'hindi': hindi_res, 'english': english_res}
            sentences = []
            for i in range(0, len(hindi_res)):
                sentence = Sentence(sentenceid=str(uuid.uuid4()), status=STATUS_PENDING,
                                    basename=str(basename), source=english_res[i],
                                    target=hindi_res[i])
                sentences.append(sentence)
            Sentence.objects.insert(sentences)
            # Clean up the uploaded and generated files.
            for f in glob.glob(app.config['UPLOAD_FOLDER'] + '/' + basename + '*'):
                os.remove(f)
            res = CustomResponse(Status.SUCCESS.value, data)
            corpus = Corpus.objects(basename=basename)
            corpus.update(set__status=STATUS_PROCESSED,
                          set__no_of_sentences=len(hindi_res))
            return res.getres()
    except Exception as e:
        print(e)
        res = CustomResponse(Status.ERR_GLOBAL_SYSTEM.value, None)
        return res.getres(), Status.ERR_GLOBAL_SYSTEM.value['http']['status']

def upload_benchmark_file():
    basename = str(int(time.time()))
    assign_to = ''
    if request.headers.get('ad-userid') is not None:
        assign_to = request.headers.get('ad-userid')
    try:
        name = request.form.getlist('name')
        source_lang = request.form.getlist('source_lang')
        if source_lang is None or len(source_lang) == 0 or len(source_lang[0]) == 0 \
                or name is None or len(name) == 0 or len(name[0]) == 0 \
                or request.files is None or request.files['file'] is None:
            res = CustomResponse(Status.ERR_GLOBAL_MISSING_PARAMETERS.value, None)
            return res.getres(), Status.ERR_GLOBAL_MISSING_PARAMETERS.value['http']['status']
        else:
            current_time = datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
            corpus = Benchmark(source_lang=source_lang[0], status=STATUS_PROCESSING,
                               name=name[0], created_on=current_time,
                               assigned_to=assign_to, last_modified=current_time,
                               author='', no_of_sentences=0, basename=basename)
            corpus.save()
            f_eng = request.files['file']
            filepath_eng = os.path.join(app.config['UPLOAD_FOLDER'],
                                        basename + '_eng_filtered.txt')
            f_eng.save(filepath_eng)
            english_res = []
            error = False
            error_messages = 'Error occurred for sentences:'
            # Read the file as bytes; keep every non-empty line that decodes as
            # UTF-8 and note the line number of any that does not.
            with open(filepath_eng, 'rb') as f:
                index = 1
                for f_data in f.readlines():
                    try:
                        if f_data.decode("utf8") != '\n' and len(f_data.decode("utf8")) > 0:
                            english_res.append(f_data.decode("utf8"))
                    except Exception:
                        error = True
                        error_messages = error_messages + ' ' + str(index)
                    index = index + 1
            data = {'english': english_res}
            for i in range(0, len(english_res)):
                sentence = Sentence(sentenceid=str(uuid.uuid4()), status=STATUS_PENDING,
                                    basename=str(basename), source=english_res[i])
                try:
                    sentence.save()
                except Exception:
                    error = True
                    error_messages = error_messages + ' ' + english_res[i]
            for f in glob.glob(app.config['UPLOAD_FOLDER'] + '/' + basename + '*'):
                os.remove(f)
            log.info(error)
            if error:
                res = Status.ERR_GLOBAL_SYSTEM.value
                res['why'] = error_messages
                return jsonify(res), 500
            else:
                res = CustomResponse(Status.SUCCESS.value, data)
                corpus = Benchmark.objects(basename=basename)
                corpus.update(set__status=STATUS_PROCESSED,
                              set__no_of_sentences=len(english_res))
                return res.getres()
    except Exception as e:
        print(e)
        res = CustomResponse(Status.ERR_GLOBAL_SYSTEM.value, None)
        return res.getres(), Status.ERR_GLOBAL_SYSTEM.value['http']['status']

def fetch_sentences():
    basename = request.args.get('basename')
    sentences = Sentence.objects(basename=basename).to_json()
    res = CustomResponse(Status.SUCCESS.value, json.loads(sentences))
    return res.getres()

def parse_sentence(self, text):
    return Sentence(self.pipeline(text))