def run(console, build, title, doc_ids=None, max_documents=-1, force=False):
    """
    Get case information from HUDOC

    :param console: console to print progress to
    :param build: build path
    :type build: str
    :param title: step title
    :type title: str
    :param doc_ids: list of document ids to retrieve, if any
    :type doc_ids: list
    :param max_documents: maximal number of documents to retrieve
    :type max_documents: int
    :param force: delete and recreate the folder
    :type force: bool
    """
    __console = console
    global print
    print = __console.print

    print(Markdown("- **Step configuration**"))
    output_folder = os.path.join(build, 'raw', 'raw_cases_info')
    print(TAB + '> Step folder: {}'.format(output_folder))
    make_build_folder(console, output_folder, force, strict=False)

    print(Markdown("- **Determining the number of cases**"))
    if doc_ids:
        _, max_documents = determine_max_documents(BASE_URL, 144579)
        print(TAB + "> Doc ids given")
    else:
        if max_documents == -1:
            print(TAB + "> The total number of documents is not provided")
            with Progress(
                    TextColumn(TAB + "> Querying HUDOC...", justify="right"),
                    StatusColumn({
                        None: '[IN PROGRESS]',
                        0: '[green] [DONE]',
                        1: '[red] [FAILED]'
                    }),
                    transient=True,
                    console=console
            ) as progress:
                task = progress.add_task("Get total number of documents")
                while not progress.finished:
                    rc, max_documents = determine_max_documents(BASE_URL, 144579)  # v1.0.0 value
                    progress.update(task, rc=rc)
        print(TAB + "> The total number of documents to retrieve: {}".format(max_documents))

    print(Markdown("- **Get case information from HUDOC**"))
    get_case_info(console, BASE_URL, max_documents, output_folder)
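
# The step above reports its HUDOC query status through a custom `StatusColumn`
# that maps the return code stored in the task field `rc` to a label. The column
# itself is not shown in this section; the sketch below is an assumption about
# how such a column could be written with rich's public ProgressColumn API, not
# the project's actual implementation.
from rich.progress import ProgressColumn
from rich.text import Text


class StatusColumn(ProgressColumn):
    """Minimal sketch: render a status label based on the task's `rc` field."""

    def __init__(self, labels):
        super().__init__()
        # e.g. {None: '[IN PROGRESS]', 0: '[green] [DONE]', 1: '[red] [FAILED]'}
        self.labels = labels

    def render(self, task):
        rc = task.fields.get('rc')  # set via progress.update(task, rc=rc)
        return Text.from_markup(self.labels.get(rc, ''))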

def run(console, build, title, doc_ids=None, force=False, update=False):
    __console = console
    global print
    print = __console.print

    print(Markdown("- **Step configuration**"))
    input_file = os.path.join(build, 'raw', 'cases_info', 'raw_cases_info_all.json')
    output_folder = os.path.join(build, 'raw', 'judgments')
    print(TAB + '> Step folder: {}'.format(output_folder))
    make_build_folder(console, output_folder, force, strict=False)

    id_list = []
    try:
        with open(input_file, 'r') as f:
            content = f.read()
            cases = json.loads(content)
            id_list = [(i['itemid'], i["application"].startswith("MS WORD"))
                       for i in cases]
    except Exception as e:
        print(e)
        return

    print(Markdown("- **Get documents from HUDOC**"))
    if doc_ids:
        id_list, in_build, not_in_build = get_files(doc_ids, id_list)
        if len(not_in_build):
            print(TAB + '> Failed to download documents: {}'.format(not_in_build))
        if len(id_list):
            print(TAB + '> Documents: {} downloaded from HUDOC'.format(in_build))
        else:
            print(TAB + "> [red] No documents to download")
            return
    get_documents(console, id_list, output_folder, update, force)
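
# `get_files(doc_ids, id_list)` above is a project helper that is assumed to split
# the requested ids into those present in the case list (`in_build`) and those that
# are not (`not_in_build`). A hypothetical equivalent, for illustration only:
def split_requested_ids(doc_ids, id_list):
    """Return (filtered id_list, ids found in the build, ids not found)."""
    known = {item_id for item_id, _ in id_list}
    in_build = [d for d in doc_ids if d in known]
    not_in_build = [d for d in doc_ids if d not in known]
    filtered = [(item_id, is_ms_word) for item_id, is_ms_word in id_list
                if item_id in in_build]
    return filtered, in_build, not_in_build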

def run(console, build, title, limit_tokens, doc_ids=None, processed_folder='all',
        force=False, update=False):
    __console = console
    global print
    print = __console.print

    input_file = os.path.join(
        build, 'raw', 'cases_info', 'raw_cases_info_{}.json'.format(processed_folder))
    input_folder = os.path.join(build, 'raw', 'normalized_documents')
    output_folder = os.path.join(build, 'structured')
    output_folder_tfidf = os.path.join(output_folder, 'tfidf')
    output_folder_bow = os.path.join(output_folder, 'bow')

    print(Markdown("- **Step configuration**"))
    print(TAB + '> Step folder: {}'.format(output_folder_tfidf))
    make_build_folder(console, output_folder_tfidf, force, strict=False)
    print(TAB + '> Step folder: {}'.format(output_folder_bow))
    make_build_folder(console, output_folder_bow, force, strict=False)

    try:
        config()['steps']['normalize']['ngrams']
    except Exception as e:
        print('Cannot retrieve n-grams configuration. Details: {}'.format(e))
        exit(5)
    print(TAB + '> Read configuration [green][DONE]')

    cases_index = {}
    with open(input_file, 'r') as f:
        content = f.read()
        cases = json.loads(content)
        cases_index = {c['itemid']: i for i, c in enumerate(cases)}

    files = get_files(doc_ids, input_folder, cases_index)

    raw_corpus = []
    corpus_id = []
    print(Markdown('- **Create dictionary**'))
    with Progress(
            TAB + "> Loading in memory... [IN PROGRESS]",
            BarColumn(30),
            TimeRemainingColumn(),
            "| Document [blue]{task.fields[doc]} [white]({task.completed}/{task.total})"
            "{task.fields[error]}",
            transient=True,
            console=console) as progress:
        task = progress.add_task(
            "Loading...",
            total=len(files),
            error="",
            doc=files[0].split('/')[-1].split('_normalized.txt')[0])
        for i, p in enumerate(files):
            error = ""
            try:
                doc_id = p.split('/')[-1].split('_normalized.txt')[0]
                raw_corpus.append(load_text_file(p).split())
                corpus_id.append(doc_id)
            except Exception as e:
                error = '\n| {}'.format('Could not load the document')
                log.debug(p, e)
            progress.update(task, advance=1, error=error, doc=doc_id)
    print(TAB + "> Loading in memory... [green][DONE]")

    # data = json.load(open('./full_dictionary.txt'))
    f = [t for doc in raw_corpus for t in doc]
    f = Counter(f)
    # Load the raw dictionary
    f = f.most_common(int(limit_tokens))
    words = [w[0] for w in f]
    # dictionary = corpora.Dictionary([all_grams])
    print(TAB + '> Create dictionary')
    dictionary = corpora.Dictionary([words])
    dictionary.save(os.path.join(output_folder, 'dictionary.dict'))
    with open(os.path.join(output_folder, 'feature_to_id.dict'), 'w') as outfile:
        json.dump(dictionary.token2id, outfile, indent=4, sort_keys=True)
    corpus = [dictionary.doc2bow(text) for text in raw_corpus]

    print(Markdown('- **Create language models**'))
    with Progress(
            TAB + "> Create Bag of Words... [IN PROGRESS]",
            BarColumn(30),
            TimeRemainingColumn(),
            "| Document [blue]{task.fields[doc]} [white]({task.completed}/{task.total})"
            "{task.fields[error]}",
            transient=True,
            console=console) as progress:
        task = progress.add_task("Loading...", total=len(corpus), error="", doc=corpus_id[0])
        for i, doc in enumerate(corpus):
            error = ""
            filename = os.path.join(output_folder_bow, '{}_bow.txt'.format(corpus_id[i]))
            # if update and not os.path.isfile(filename):
            with open(filename, 'w') as file:
                for f, v in doc:
                    file.write('{}:{} '.format(f, v))
            progress.update(task, advance=1, error=error, doc=corpus_id[i])
    print(TAB + "> Create Bag of Words... [green][DONE]")

    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    with Progress(
            TAB + "> Create TF-IDF... [IN PROGRESS]",
            BarColumn(30),
            TimeRemainingColumn(),
            "| Document [blue]{task.fields[doc]} [white]({task.completed}/{task.total})"
            "{task.fields[error]}",
            transient=True,
            console=console) as progress:
        task = progress.add_task("Loading...", total=len(corpus_tfidf), error="", doc=corpus_id[0])
        for i, doc in enumerate(corpus_tfidf):
            error = ""
            with open(
                    os.path.join(output_folder_tfidf, '{}_tfidf.txt'.format(corpus_id[i])),
                    'w') as file:
                for f, v in doc:
                    file.write('{}:{} '.format(f, v))
            progress.update(task, advance=1, error=error, doc=corpus_id[i])
    print(TAB + "> Create TF-IDF... [green][DONE]")
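
# Both loops above serialize each document as space-separated "feature_id:value"
# pairs ({itemid}_bow.txt and {itemid}_tfidf.txt). A small reader for that format,
# shown as an illustrative sketch (the file path in the usage line is hypothetical):
def read_sparse_vector(path):
    """Parse a '{feature_id}:{value} ...' file back into a {int: float} dict."""
    with open(path, 'r') as f:
        pairs = f.read().split()
    return {int(p.split(':')[0]): float(p.split(':')[1]) for p in pairs}

# Example usage:
# vec = read_sparse_vector('build/structured/tfidf/001-57574_tfidf.txt')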

def run(console, build, title, doc_ids=None, output_prefix='cases', force=False):
    __console = console
    global print
    print = __console.print

    print(Markdown("- **Step configuration**"))
    print(TAB + "> Prepare release folder structure")
    paths = ['unstructured', 'structured', 'raw']
    for p in paths:
        make_build_folder(console, os.path.join(build, p), force, strict=False)

    print(Markdown("- **Normalize database**"))
    input_folder = os.path.join(build, 'raw', 'preprocessed_documents')
    start = time.perf_counter()
    cases_files = get_files(doc_ids, input_folder)
    stop = time.perf_counter()
    print(TAB + "> Prepare unstructured cases in {:0.4f}s [green][DONE]".format(stop - start))

    # Unstructured
    start = time.perf_counter()
    with open(os.path.join(build, 'unstructured', 'cases.json'), 'w') as outfile:
        outfile.write('[\n')
        for i, f in enumerate(cases_files):
            with open(f) as json_file:
                data = json.load(json_file)
                json.dump(data, outfile, indent=4)
            if i != len(cases_files) - 1:
                outfile.write(',\n')
        outfile.write('\n]')
    stop = time.perf_counter()

    # Structured
    print(TAB + "> Generate flat cases in {:0.4f}s [green][DONE]".format(stop - start))
    start = time.perf_counter()
    flat_cases, representatives, extractedapp, scl, decision_body = format_structured_json(
        cases_files)
    stop = time.perf_counter()
    print(TAB + "> Flat cases size: {}KB in {:0.4f}s".format(
        sys.getsizeof(flat_cases) / 1000, stop - start))

    schema_hints = {
        'article': {'col_type': COL_HINT.HOT_ONE},
        'documentcollectionid': {'col_type': COL_HINT.HOT_ONE},
        'applicability': {'col_type': COL_HINT.HOT_ONE},
        'paragraphs': {'col_type': COL_HINT.HOT_ONE},
        'decision_body': {'col_type': COL_HINT.HOT_ONE},
        'conclusion': {'col_type': COL_HINT.HOT_ONE, 'sub_element': 'flatten'}
    }

    output_path = os.path.join(build, 'structured')
    with open(os.path.join(output_path, 'flat_cases.json'), 'w') as outfile:
        json.dump(flat_cases, outfile, indent=4)
    with open(os.path.join(output_path, 'schema_hint.json'), 'w') as outfile:
        json.dump(schema_hints, outfile, indent=4)

    X = flat_cases
    start = time.perf_counter()
    df, schema, flat_schema, flat_type_mapping, flat_domain_mapping = normalize(X, schema_hints)
    df.to_json(os.path.join(output_path, '{}.json'.format(output_prefix)), orient='records')
    df.to_csv(os.path.join(output_path, '{}.csv'.format(output_prefix)))
    json_files = [
        ('schema', schema.to_schema()),
        ('flat_schema', flat_schema.as_dict()),
        ('flat_type_mapping', flat_type_mapping),
        ('flat_domain_mapping', flat_domain_mapping)
    ]
    for f in json_files:
        with open(os.path.join(output_path, '{}_{}.json'.format(output_prefix, f[0])),
                  'w') as outfile:
            json.dump(f[1], outfile, indent=4)
    os.remove(os.path.join(output_path, 'flat_cases.json'))
    os.remove(os.path.join(output_path, 'cases_flat_schema.json'))
    os.remove(os.path.join(output_path, 'cases_flat_type_mapping.json'))
    stop = time.perf_counter()
    print(TAB + '> Generate appnos matrice in {:0.4f}s [green][DONE]'.format(stop - start))

    matrice_appnos = {}
    for k, v in extractedapp.items():
        matrice_appnos[k] = {e: 1 for e in v['appnos']}
    with open(os.path.join(output_path, 'matrice_appnos.json'), 'w') as outfile:
        json.dump(matrice_appnos, outfile, indent=4)

    print(TAB + '> Generate scl matrice [green][DONE]')
    matrice_scl = {}
    for k, v in scl.items():
        matrice_scl[k] = {e: 1 for e in v['scl']}
    with open(os.path.join(output_path, 'matrice_scl.json'), 'w') as outfile:
        json.dump(matrice_scl, outfile, indent=4)

    print(TAB + '> Generate representatives matrice [green][DONE]')
    matrice_representedby = {}
    for k, v in representatives.items():
        matrice_representedby[k] = {e: 1 for e in v['representedby']}
    with open(os.path.join(output_path, 'matrice_representatives.json'), 'w') as outfile:
        json.dump(matrice_representedby, outfile, indent=4)

    print(TAB + '> Generate decision body matrice [green][DONE]')
    matrice_decision_body = {}
    for k, v in decision_body.items():
        matrice_decision_body[k] = {role_k: role_v for role_k, role_v in v['role'].items()}
    with open(os.path.join(output_path, 'matrice_decision_body.json'), 'w') as outfile:
        json.dump(matrice_decision_body, outfile, indent=4)

    print(TAB + '> Create archives [green][DONE]')
    # Raw
    shutil.make_archive(os.path.join(build, 'raw', 'judgments'), 'zip',
                        os.path.join(build, 'raw', 'judgments'))
    # All
    from zipfile import ZipFile
    with ZipFile(os.path.join(build, 'all.zip'), 'w') as zipObj:
        # Iterate over all the files in the release folders
        folders = ['unstructured', 'raw', 'structured']
        for f in folders:
            for folderName, _, filenames in os.walk(os.path.join(build, f)):
                for filename in filenames:
                    if not filename.endswith('.zip'):
                        filePath = os.path.join(folderName, filename)
                        zipObj.write(filePath)
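
# Each matrice_*.json written above maps a case id to a {value: 1} dict (appnos,
# scl, representatives) or to a {member: role} dict (decision body). A sketch of
# turning one of them into a case-by-value pandas DataFrame; the file path in the
# usage line is an example:
import json

import pandas as pd


def load_matrix(path):
    """Load a matrice_*.json file as a DataFrame indexed by case id, 0-filled."""
    with open(path, 'r') as f:
        data = json.load(f)
    return pd.DataFrame.from_dict(data, orient='index').fillna(0)

# df = load_matrix('build/structured/matrice_appnos.json')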

def run(console, build, title, doc_ids=None, force=False, update=False):
    __console = console
    global print
    print = __console.print

    print(Markdown("- **Step configuration**"))
    input_file = os.path.join(build, 'raw', 'cases_info', 'raw_cases_info_all.json')
    input_folder = os.path.join(build, 'raw', 'judgments')
    output_folder = os.path.join(build, 'raw', 'preprocessed_documents')
    print(TAB + '> Step folder: {}'.format(output_folder))
    make_build_folder(console, output_folder, force, strict=False)

    stats = {'parser_type': {'OLD': 0, 'NEW': 0}}
    with open(input_file, 'r') as f:
        content = f.read()
        cases = json.loads(content)
        if doc_ids:
            cases_index = {
                c['itemid']: i
                for i, c in enumerate(cases) if c['itemid'] in doc_ids
            }
        else:
            cases_index = {c['itemid']: i for i, c in enumerate(cases)}

    correctly_parsed = 0
    failed = []
    files = get_files(doc_ids, input_folder)
    decision_body_not_parsed = []

    print(Markdown('- **Preprocess documents**'))
    with Progress(
            TAB + "> Preprocess documents... [IN PROGRESS]\n",
            BarColumn(30),
            TimeRemainingColumn(),
            "| Document [blue]{task.fields[doc]} [white]({task.completed}/{task.total})"
            "{task.fields[error]}",
            transient=True,
            console=console) as progress:
        task = progress.add_task("Preprocessing...",
                                 total=len(files),
                                 error="",
                                 doc=files[0].split('/')[-1].split('.')[0])
        for _, p in enumerate(files):
            error = ""
            id_doc = p.split('/')[-1].split('.')[0]
            filename_parsed = os.path.join(output_folder, '{}_parsed.json'.format(id_doc))
            if not update or not os.path.isfile(filename_parsed):
                try:
                    p_ = update_docx(p)
                    doc = Document(p_)
                    parser = select_parser(doc)
                    stats['parser_type'][parser] += 1
                    if parser == 'NEW':
                        parsed, attachments, db_not_parsed = parse_document(doc, id_doc, build)
                        decision_body_not_parsed.extend(db_not_parsed)
                        parsed.update(cases[cases_index[id_doc]])
                        with open(
                                os.path.join(
                                    output_folder,
                                    '{}_text_without_conclusion.txt'.format(id_doc)),
                                'w') as toutfile:
                            toutfile.write(
                                json_to_text(parsed,
                                             text_only=True,
                                             except_section=['conclusion'],
                                             attachments=attachments))
                        parsed['documents'] = ['{}.docx'.format(id_doc)]
                        parsed['content'] = {
                            '{}.docx'.format(id_doc): parsed['elements']
                        }
                        parsed['attachments'] = {
                            '{}.docx'.format(id_doc): attachments
                        }
                        del parsed['elements']
                        with open(filename_parsed, 'w') as outfile:
                            json.dump(parsed, outfile, indent=4, sort_keys=True)
                        correctly_parsed += 1
                    else:
                        raise Exception("OLD parser is not available yet.")
                except Exception as e:
                    # __console.print_exception()
                    failed.append((id_doc, e))
                    error = "\n| Could not preprocess {}".format(id_doc)
                    error += "\n| {}".format(e)
                    log.debug("{} {}".format(p, e))
            else:
                error = '\n| Skip document because it is already processed'
                correctly_parsed += 1
            progress.update(task, advance=1, error=error, doc=id_doc)

    if correctly_parsed == len(files):
        print(TAB + "> Preprocess documents... [green][DONE]")
    else:
        print(TAB + "> Preprocess documents... [yellow][WARNING]")
        print(TAB + "[bold yellow]:warning: Some documents could not be preprocessed")
        print(TAB + " [bold yellow]THE FINAL DATABASE WILL BE INCOMPLETE!")

    print(TAB + '> Correctly parsed: {}/{} ({:.4f}%)'.format(
        correctly_parsed, len(files), (100. * correctly_parsed) / len(files)))

    if correctly_parsed != len(files):
        print(TAB + '> List of failed documents:')
        table = Table()
        table.add_column("Case ID", style="cyan", no_wrap=True)
        table.add_column("Error", justify="left", style="magenta")
        for e in failed:
            table.add_row(e[0], str(e[1]))
        print(table)

    print(TAB + "> Save incorrectly parsed decision body members... [green][DONE]")
    decision_body_not_parsed = pd.DataFrame(decision_body_not_parsed)
    with open(Path(build) / get_log_folder() / f'{title}_decision_body.html', 'w') as f:
        decision_body_not_parsed.to_html(f)
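
# For reference, the {itemid}_parsed.json written above combines the parsed judgment
# with the case metadata from raw_cases_info_all.json. Based on the assignments in
# the loop, its shape is roughly the following (all values are placeholders, and the
# exact metadata keys depend on the case info file):
EXAMPLE_PARSED = {
    "itemid": "001-XXXXX",  # metadata fields merged via parsed.update(...)
    "documents": ["001-XXXXX.docx"],
    "content": {"001-XXXXX.docx": ["<parsed elements>"]},
    "attachments": {"001-XXXXX.docx": ["<attachments>"]},
    # ...remaining HUDOC metadata keys from the case info file
}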

def run(console, build, title, doc_ids=None, force=False):
    __console = console
    global print
    print = __console.print

    print(Markdown("- **Step configuration**"))
    input_folder = os.path.join(build, 'raw', 'raw_cases_info')
    output_folder = path.join(build, 'raw', 'cases_info')
    print(TAB + '> Step folder: {}'.format(output_folder))
    make_build_folder(console, output_folder, force, strict=False)

    cases = []
    files = [
        path.join(input_folder, f) for f in listdir(input_folder)
        if path.isfile(path.join(input_folder, f)) and '.json' in f
    ]
    for p in files:
        try:
            with open(p, 'r') as f:
                content = f.read()
                index = json.loads(content)
                cases.extend(index["results"])
        except Exception as e:
            log.info(p, e)
    cases = [c["columns"] for c in cases]

    print(Markdown("- **Filter cases**"))
    cases = filter_cases(cases)
    print(Markdown("- **Format cases metadata**"))
    cases = format_cases(console, cases)
    print(Markdown("- **Generate statistics**"))
    stats = generate_statistics(cases)
    with open(path.join(output_folder, 'filter.statistics.json'), 'w') as outfile:
        json.dump(stats, outfile, indent=4, sort_keys=True)
    with open(path.join(output_folder, 'raw_cases_info_all.json'), 'w') as outfile:
        json.dump(cases, outfile, indent=4, sort_keys=True)

    filtered_cases = []
    for c in cases:
        classes = []
        for e in c['conclusion']:
            if e['type'] in ['violation', 'no-violation']:
                if 'article' in e:
                    g = e['article']
                    classes.append('{}:{}'.format(g, 1 if e['type'] == 'violation' else 0))
        classes = list(set(classes))
        opposed_classes = any([
            e for e in classes
            if e.split(':')[0] + ':' + str(abs(1 - int(e.split(':')[-1]))) in classes
        ])
        if len(classes) > 0 and not opposed_classes:
            filtered_cases.append(c)

    outcomes = {}
    cases_per_articles = {}
    for c in filtered_cases:
        ccl = c['conclusion']
        for e in ccl:
            if e['type'] in ['violation', 'no-violation']:
                if 'article' in e:
                    if e['article'] not in outcomes:
                        outcomes[e['article']] = {
                            'violation': 0,
                            'no-violation': 0,
                            'total': 0
                        }
                    outcomes[e['article']][e['type']] += 1
                    outcomes[e['article']]['total'] += 1
                    if e['article'] not in cases_per_articles:
                        cases_per_articles[e['article']] = []
                    cases_per_articles[e['article']].append(c)

    print(Markdown("- **Generate case listing for datasets**"))
    multilabel_cases = []
    multilabel_index = set()
    with Progress(TAB + "> Generate case info for specific article [IN PROGRESS]",
                  "| {task.fields[progress_array]}",
                  transient=True,
                  console=console) as progress:
        progress_array = []

        def to_str(a):
            if len(a) == 1:
                return '[[green]{}[white]]'.format(a[0])
            return '[{}{}]'.format(
                ''.join(['[green]{}[white], '.format(e) for e in a[:-1]]), a[-1])

        task = progress.add_task("Generate datasets cases",
                                 total=len(outcomes),
                                 progress_array="[]")
        for k in outcomes.keys():
            progress_array.append(k)
            with open(
                    path.join(output_folder, 'raw_cases_info_article_{}.json'.format(k)),
                    'w') as outfile:
                json.dump(cases_per_articles[k], outfile, indent=4, sort_keys=True)
            multilabel_cases.extend(cases_per_articles[k])
            for c in cases_per_articles[k]:
                multilabel_index.add(c['itemid'])
            progress.update(task, advance=1, progress_array=to_str(progress_array))
    print(TAB + "> Generate case info for specific article [green][DONE]")

    multilabel_cases_unique = []
    for c in multilabel_cases:
        if c['itemid'] in multilabel_index:
            multilabel_cases_unique.append(c)
            multilabel_index.discard(c['itemid'])
    with open(path.join(output_folder, 'raw_cases_info_multilabel.json'), 'w') as outfile:
        json.dump(multilabel_cases_unique, outfile, indent=4, sort_keys=True)
    print(TAB + "> Generate case info for multilabel dataset [green][DONE]")

    multiclass_index = {}  # Key: case ID / Value: article whose dataset the case is assigned to
    multiclass_cases = []
    sorted_outcomes = dict(sorted(outcomes.items(), key=lambda x: x[1]['total'])).keys()
    for k in sorted_outcomes:
        for c in cases_per_articles[k]:
            if c['itemid'] not in multiclass_index:
                nb_datasets = [e['article'] for e in c['conclusion'] if 'article' in e]
                if len(list(set(nb_datasets))) == 1:
                    for cc in c['conclusion']:
                        if 'article' in cc and cc['article'] == k:
                            c['mc_conclusion'] = [cc]
                            break
                    if 'mc_conclusion' in c:
                        multiclass_index[c['itemid']] = k
                        multiclass_cases.append(c)
                    else:
                        log.info('No article found for {}'.format(c['itemid']))
                else:
                    log.info('Case {} in {} datasets: {}. Skip for multiclass.'.format(
                        c['itemid'], len(set(nb_datasets)), ','.join(list(set(nb_datasets)))))
    with open(path.join(output_folder, 'raw_cases_info_multiclass.json'), 'w') as outfile:
        json.dump(multiclass_cases, outfile, indent=4, sort_keys=True)
    print(TAB + "> Generate case info for multiclass [green][DONE]")
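
# The filter above encodes each conclusion as "{article}:1" (violation) or
# "{article}:0" (no-violation) and drops cases where the same article appears
# with both labels. A toy illustration of that rule (the data is made up):
conclusion = [
    {'type': 'violation', 'article': '6'},
    {'type': 'no-violation', 'article': '6'},
    {'type': 'violation', 'article': '3'},
]
classes = list({'{}:{}'.format(e['article'], 1 if e['type'] == 'violation' else 0)
                for e in conclusion if e['type'] in ['violation', 'no-violation']})
opposed = any(e.split(':')[0] + ':' + str(abs(1 - int(e.split(':')[-1]))) in classes
              for e in classes)
# opposed is True here ('6:1' and '6:0' both present), so the case would be excluded.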

def run(console, build, title, doc_ids=None, force=False, update=False):
    __console = console
    global print
    print = __console.print

    print(Markdown("- **Step configuration**"))
    input_folder = os.path.join(build, 'raw', 'preprocessed_documents')
    output_folder = os.path.join(build, 'raw', 'normalized_documents')
    ngrams_config = {}
    try:
        ngrams_config = config()['steps']['normalize']['ngrams']
    except Exception as e:
        print('Cannot retrieve n-grams configuration. Details: {}'.format(e))
        exit(5)

    print(TAB + '> Step folder: {}'.format(output_folder))
    make_build_folder(console, output_folder, force, strict=False)

    files = get_files(doc_ids, input_folder)

    raw_corpus = []
    corpus_id = []
    print(Markdown('- **Load documents**'))
    with Progress(
            TAB + "> Loading in memory... [IN PROGRESS]",
            BarColumn(30),
            TimeRemainingColumn(),
            "| Document [blue]{task.fields[doc]} [white]({task.completed}/{task.total})"
            "{task.fields[error]}",
            transient=True,
            console=console
    ) as progress:
        task = progress.add_task(
            "Loading...",
            total=len(files),
            error="",
            doc=files[0].split('/')[-1].split('_text_without_conclusion.txt')[0])
        for i, p in enumerate(files):
            error = ""
            doc_id = p.split('/')[-1].split('_text_without_conclusion.txt')[0]
            try:
                raw_corpus.append(load_text_file(p))
                corpus_id.append(doc_id)
            except Exception as e:
                error = '\n| {}'.format('Could not load the document')
                log.debug(p, e)
            progress.update(task, advance=1, error=error, doc=doc_id)
    print(TAB + "> Loading in memory... [green][DONE]")

    normalized_tokens = []
    print(Markdown('- **Generate language model**'))
    try:
        with Progress(
                TAB + "> Normalize... [IN PROGRESS]\n",
                BarColumn(30),
                TimeRemainingColumn(),
                "| Document [blue]{task.fields[doc]} [white]({task.completed}/{task.total})"
                "{task.fields[error]}",
                transient=True,
                console=console
        ) as progress:
            task = progress.add_task("Compute tokens...",
                                     total=len(raw_corpus),
                                     error="",
                                     doc=corpus_id[0])
            for i, doc in enumerate(raw_corpus):
                error = ""
                filename = os.path.join(output_folder, '{}_normalized.txt'.format(corpus_id[i]))
                if not update or not os.path.isfile(filename):
                    normalized_tokens.append(normalized_step(doc, force=force, lemmatization=True))
                else:
                    with open(filename, 'r') as f:
                        normalized_tokens.extend(f.read().split())
                progress.update(task, advance=1, error=error, doc=corpus_id[i])
    except Exception as e:
        print(TAB + '[bold red]:double_exclamation_mark: Could not normalize the tokens. '
                    'Details: {}'.format(e))
        exit(40)
    print(TAB + "> Normalize... [green][DONE]")

    all_grams = []
    doc_grammed = []
    try:
        with Progress(
                TAB + "> Compute ngrams... [IN PROGRESS]\n",
                BarColumn(30),
                TimeRemainingColumn(),
                "| Document [blue]{task.fields[doc]} [white]({task.completed}/{task.total})"
                "{task.fields[error]}",
                transient=True,
                console=console
        ) as progress:
            task = progress.add_task("Compute tokens...",
                                     total=len(corpus_id),
                                     error="",
                                     doc=corpus_id[0])
            for i, doc in enumerate(normalized_tokens):
                error = ""
                filename = os.path.join(output_folder, '{}_normalized.txt'.format(corpus_id[i]))
                if not update or not os.path.isfile(filename):
                    grams = ngram_step(doc, ngrams_config, force=force)
                    merged = []
                    for g in grams.values():
                        merged.extend(g)
                    doc_grammed.append(merged)
                    all_grams.extend(merged)
                else:
                    error = "\n| Load document as already normalized."
                    with open(filename, 'r') as f:
                        all_grams.extend(f.read().split())
                        doc_grammed.append(None)
                progress.update(task, advance=1, error=error, doc=corpus_id[i])
    except Exception:
        console.print_exception()
    print(TAB + "> Compute ngrams... [green][DONE]")

    f = Counter(all_grams)
    with open(os.path.join(output_folder, 'full_dictionary.txt'), 'w') as outfile:
        json.dump(f, outfile, indent=4, sort_keys=True)
    print(TAB + '> Save the full dictionary [green][DONE]')

    with Progress(
            TAB + "> Save normalized documents... [IN PROGRESS]\n",
            BarColumn(30),
            TimeRemainingColumn(),
            "| Document [blue]{task.fields[doc]} [white]({task.completed}/{task.total})"
            "{task.fields[error]}",
            transient=True,
            console=console
    ) as progress:
        task = progress.add_task("Compute tokens...",
                                 total=len(doc_grammed),
                                 error="",
                                 doc=corpus_id[0])
        for i, doc in enumerate(doc_grammed):
            error = ""
            if doc is not None:
                with open(os.path.join(output_folder,
                                       '{}_normalized.txt'.format(corpus_id[i])), 'a') as file:
                    file.write(' '.join(doc))
            progress.update(task, advance=1, error=error, doc=corpus_id[i])
    print(TAB + '> Save normalized documents... [green][DONE]')

def run(console, build, title, doc_ids=None, articles=[], processed_folder='all', force=True):
    __console = console
    global print
    print = __console.print

    suffix = '_{}'.format(processed_folder)
    input_file = os.path.join(build, 'raw', 'cases_info', 'raw_cases_info{}.json'.format(suffix))
    input_folder = os.path.join(build, 'structured')
    output_folder = os.path.join(build, 'datasets')
    input_folder_bow = os.path.join(input_folder, 'bow')
    print(Markdown("- **Step configuration**"))
    print(TAB + '> Step folder: {}'.format(output_folder))
    make_build_folder(console, output_folder, force, strict=False)

    # Get the list of cases s.t. we have a BoW and TF-IDF representation
    files = get_files(doc_ids, input_folder_bow, input_folder)
    id_list = [f.split('/')[-1].split('_')[0] for f in files]

    # Read the case info
    cases = []
    try:
        with open(input_file, 'r') as f:
            content = f.read()
            cases = json.loads(content)
    except Exception as e:
        print(e)
        exit(1)

    # Filter the cases info to keep only the items in id_list
    cases = [c for c in cases if c['itemid'] in id_list]
    conclusion_key = 'conclusion' if processed_folder != 'multiclass' else 'mc_conclusion'
    cases = [c for c in cases if conclusion_key in c]

    keys = [
        "itemid", "respondent", "rank", "applicability", "decisiondate",
        "doctypebranch", "importance", "introductiondate", "judgementdate",
        "originatingbody_type", "originatingbody_name", "respondent",
        "respondentOrderEng", "separateopinion", "typedescription"
    ]
    keys_list = [
        "article", "documentcollectionid", "externalsources", "extractedappno",
        "kpthesaurus", "parties", "scl", "representedby"
    ]

    feature_index = {k: i for i, k in enumerate(keys + keys_list)}
    feature_to_value = dict(zip(keys + keys_list, [None] * (len(keys) + len(keys_list))))
    for c in cases:
        for k, v in c.items():
            if k in keys:
                if feature_to_value[k] is None:
                    feature_to_value[k] = set()
                feature_to_value[k].add(v)
            if k in keys_list:
                if feature_to_value[k] is None:
                    feature_to_value[k] = set()
                feature_to_value[k].update(v)

    feature_to_encoded = {}
    count = 0
    for k, s in feature_to_value.items():
        for v in s:
            if k in keys:
                feature_to_encoded[u'{}={}'.format(k, v)] = count
            elif k in keys_list:
                feature_to_encoded[u'{}_has_{}'.format(k, v)] = count
            count += 1

    # Encode conclusions
    outcomes = {}
    for i, c in enumerate(cases):
        ccl = c[conclusion_key]
        for e in ccl:
            if e['type'] in ['violation', 'no-violation']:
                if e['base_article'] not in outcomes:
                    outcomes[e['base_article']] = {
                        'violation': 0,
                        'no-violation': 0,
                        'total': 0
                    }
                # if e['article'] == '8' and e['type'] == 'no-violation':
                #     print(c['docname'])
                outcomes[e['base_article']][e['type']] += 1
                outcomes[e['base_article']]['total'] += 1

    # Determine output
    encoded_outcomes = {}
    count = 1
    for i, _ in outcomes.items():
        encoded_outcomes[i] = count
        count += 1

    offset = len(feature_to_encoded)

    print(Markdown('- **Generate dataset**'))
    generate_dataset(cases=cases,
                     keys=keys,
                     keys_list=keys_list,
                     encoded_outcomes=encoded_outcomes,
                     feature_index=feature_index,
                     feature_to_encoded=feature_to_encoded,
                     output_path=output_folder,
                     name=processed_folder,
                     offset=offset,
                     processed_folder=input_folder,
                     filter_classes=None if articles == [] else articles,
                     force=force)
    shutil.make_archive(output_folder, 'zip', output_folder)
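
# The encoding above assigns one integer per observed value: scalar fields become
# "key=value" features, list fields become "key_has_value" features, outcome
# classes are numbered from 1, and BoW/TF-IDF feature ids start at
# offset = len(feature_to_encoded). A made-up illustration of the resulting mapping:
#
#   feature_to_encoded = {
#       'importance=1': 0,
#       'respondent=FRA': 1,
#       'article_has_6': 2,
#       ...
#   }
#   encoded_outcomes = {'6': 1, '3': 2, ...}
#   offset = len(feature_to_encoded)  # first id available for textual features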