def segment_text_job(json_dic,pickle_path,filepath,which):
    '''
    Run the 'nltk-seg' job: split one text of a JSON document in sentences.

    json_dic: json object; json_dic[which]['source'] is read and
              json_dic[which]['segments'] is overwritten.
    pickle_path: handed to __segment_text together with the source text.
    filepath: when it is not the empty string, json_dic is passed to
              save_json with this path; otherwise the segments are put in
              the job meta instead.
    which: which part of json the text comes from.

    Returns whatever __segment_text produced (not the created segments).
    '''
    print('Starting task segment')
    job = init_job('nltk-seg')

    part = json_dic[which]
    result = __segment_text(part['source'], pickle_path)
    # TODO change ! remove save as well?
    part['segments'] = create_segments(result)

    if filepath == '':
        # Nothing to save on disk: expose the segments through the job meta.
        job.meta['output'] = part['segments']
    else:
        save_json(json_dic, filepath)
        job.meta['doSave'] = '1'

    # Bookkeeping written in both cases.
    job.meta['type'] = 'onesegment'
    job.meta['which'] = which
    job.meta['progress'] = 100
    job.save_meta()
    return result
def maligna_seg_job(text1, text2):
    '''
    Run the 'maligna-seg' job on a pair of texts.

    text1, text2: the two texts; they are converted with text2xmlstring and
                  fed to the two commands from cmd_2_split_in_sentences.

    The converted result (results2segment_lists) is stored in
    job.meta['output'] with type 'twosegments'. Nothing is returned.
    '''
    job = init_job('maligna-seg')

    first_cmd, second_cmd = cmd_2_split_in_sentences()
    xml_input = text2xmlstring(text1, text2)
    raw_result = execute_short_double(first_cmd, second_cmd, xml_input)

    job.meta['output'] = results2segment_lists(raw_result)
    job.meta['type'] = 'twosegments'
    job.save_meta()
    print('Task completed')
def maligna_align_job(jsonData):
    '''
    Run the 'maligna-align' job on a JSON document.

    jsonData: json object; it is converted with json2xmlstring and fed to
              the command from cmd_2_align_sentences.

    The outcome of compare_results(jsonData, <command result>) is stored in
    job.meta['output'] with type 'align'. Nothing is returned.
    '''
    job = init_job('maligna-align')

    align_cmd = cmd_2_align_sentences()
    xml_input = json2xmlstring(jsonData)
    raw_result = execute_short_single(align_cmd, xml_input)

    job.meta['output'] = compare_results(jsonData, raw_result)
    job.meta['type'] = 'align'
    job.save_meta()
    print('Task completed')
def split_in_sentences_with_model(json_dic,which,language):
    '''
    Run the 'spacy-seg' job: split one text of a JSON document in sentences
    using the pipeline returned by __load_spacy(language).

    json_dic: json object; json_dic[which]['source'] is read and
              json_dic[which]['segments'] is overwritten.
    which: which part of json the text comes from.
    language: passed to __load_spacy to obtain the pipeline.

    The created segments are also stored in job.meta['output'].
    Returns the list of stripped sentence strings.
    '''
    print('Starting')
    job = init_job('spacy-seg')
    raw_text = json_dic[which]['source']
    nlp = __load_spacy(language)
    doc = nlp(raw_text)
    # Span.text instead of the legacy Span.string (removed in spaCy 3);
    # after strip() both yield the same sentence string.
    sentences = [sent.text.strip() for sent in doc.sents]
    json_dic[which]['segments'] = create_segments(sentences)
    job.meta['output'] = json_dic[which]['segments']
    job.meta['type'] = 'onesegment'
    job.meta['which'] = which
    job.meta['progress'] = 100
    job.save_meta()
    # NOTE(review): the original author questioned why this is returned;
    # kept so existing callers keep working.
    return sentences
def split_in_sentences(json_dic,which):
    '''
    Run the 'spacy-seg' job: split one text of a JSON document in sentences
    using an English() pipeline with only a 'sentencizer' component added.

    json_dic: json object; json_dic[which]['source'] is read and
              json_dic[which]['segments'] is overwritten.
    which: which part of json the text comes from.

    The created segments are also stored in job.meta['output'].
    Returns the list of stripped sentence strings.
    '''
    print('Starting')
    job = init_job('spacy-seg')
    raw_text = json_dic[which]['source']
    nlp = English()
    # NOTE(review): create_pipe + add_pipe(component) is the spaCy 2.x form;
    # confirm the installed version before upgrading.
    nlp.add_pipe(nlp.create_pipe('sentencizer'))
    doc = nlp(raw_text)
    # Span.text instead of the legacy Span.string (removed in spaCy 3);
    # after strip() both yield the same sentence string.
    sentences = [sent.text.strip() for sent in doc.sents]
    json_dic[which]['segments'] = create_segments(sentences)
    job.meta['output'] = json_dic[which]['segments']
    job.meta['type'] = 'onesegment'
    job.meta['which'] = which
    job.meta['progress'] = 100
    job.save_meta()
    # NOTE(review): the original author questioned why this is returned;
    # kept so existing callers keep working.
    return sentences