Example 1
def run(console, build, title, doc_ids=None, max_documents=-1, force=False):
    """
        Get case information from HUDOC

        :param build: build path
        :type string
        :param: max_documents: maximal number of documents to retrieve
        :type: int
        :param: force: delete and recreate the folder
        :type: bool
    """
    __console = console
    global print
    print = __console.print

    print(Markdown("- **Step configuration**"))
    output_folder = os.path.join(build, 'raw', 'raw_cases_info')
    print(TAB + '> Step folder: {}'.format(output_folder))
    make_build_folder(console, output_folder, force, strict=False)

    print(Markdown("- **Determining the number cases**"))

    if doc_ids:
        _, max_documents = determine_max_documents(BASE_URL, 144579)
        print(TAB + "> Doc ids given")

    else:
        if max_documents == -1:
            print(TAB + "> The total number of documents is not provided")
            with Progress(
                    TextColumn(TAB + "> Querying HUDOC...", justify="right"),
                    StatusColumn({
                        None: '[IN PROGRESS]',
                        0: '[green] [DONE]',
                        1: '[red] [FAILED]'
                    }),
                    transient=True,
                    console=console
            ) as progress:
                task = progress.add_task("Get total number of documents")
                while not progress.finished:
                    rc, max_documents = determine_max_documents(BASE_URL, 144579)  # v1.0.0 value
                    progress.update(task, rc=rc)
    print(TAB + "> The total number of documents to retrieve: {}".format(max_documents))
    print(Markdown("- **Get case information from HUDOC**"))
    get_case_info(console, BASE_URL, max_documents, output_folder)
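StatusColumn is a project-specific rich progress column that is not shown in this example. A minimal sketch of how such a column could be implemented, assuming the return code is passed as a task field via progress.update(task, rc=rc) as above (the class name and label mapping come from the call site; the body is an assumption):

from rich.progress import ProgressColumn
from rich.text import Text


class StatusColumn(ProgressColumn):
    """Render a status label based on the 'rc' field of the task."""

    def __init__(self, labels):
        super().__init__()
        # e.g. {None: '[IN PROGRESS]', 0: '[green] [DONE]', 1: '[red] [FAILED]'}
        self.labels = labels

    def render(self, task):
        rc = task.fields.get('rc')
        return Text.from_markup(self.labels.get(rc, ''))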
Example 2
def run(console, build, title, doc_ids=None, force=False, update=False):
    __console = console
    global print
    print = __console.print

    print(Markdown("- **Step configuration**"))
    input_file = os.path.join(build, 'raw', 'cases_info',
                              'raw_cases_info_all.json')
    output_folder = os.path.join(build, 'raw', 'judgments')
    print(TAB + '> Step folder: {}'.format(output_folder))
    make_build_folder(console, output_folder, force, strict=False)
    id_list = []
    try:
        with open(input_file, 'r') as f:
            content = f.read()
            cases = json.loads(content)
            id_list = [(i['itemid'], i["application"].startswith("MS WORD"))
                       for i in cases]
    except Exception as e:
        print(e)
        return

    print(Markdown("- **Get documents from HUDOC**"))

    if doc_ids:
        id_list, in_build, not_in_build = get_files(doc_ids, id_list)
        if not_in_build:
            print(TAB +
                  '> Failed to download documents: {} '.format(not_in_build))
        if id_list:
            print(TAB +
                  '> Documents: {} downloaded from HUDOC'.format(in_build))
        else:
            print(TAB + "> [red] No documents to download")
            return

    get_documents(console, id_list, output_folder, update, force)
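For reference, a possible way to invoke this step from a script, assuming a rich Console and an existing build folder (the build path and item id below are illustrative placeholders):

from rich.console import Console

console = Console()
run(console,
    build='./build/echr_database',  # hypothetical build folder
    title='get_documents',
    doc_ids=['001-57574'],          # hypothetical HUDOC item id
    force=False,
    update=True)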
Example 3
def run(console,
        build,
        title,
        limit_tokens,
        doc_ids=None,
        processed_folder='all',
        force=False,
        update=False):
    __console = console
    global print
    print = __console.print

    input_file = os.path.join(
        build, 'raw', 'cases_info',
        'raw_cases_info_{}.json'.format(processed_folder))
    input_folder = os.path.join(build, 'raw', 'normalized_documents')
    output_folder = os.path.join(build, 'structured')
    output_folder_tfidf = os.path.join(output_folder, 'tfidf')
    output_folder_bow = os.path.join(output_folder, 'bow')

    print(Markdown("- **Step configuration**"))
    print(TAB + '> Step folder: {}'.format(output_folder_tfidf))
    make_build_folder(console, output_folder_tfidf, force, strict=False)
    print(TAB + '> Step folder: {}'.format(output_folder_bow))
    make_build_folder(console, output_folder_bow, force, strict=False)

    try:
        config()['steps']['normalize']['ngrams']
    except Exception as e:
        print('Cannot retrieve n-grams configuration. Details: {}'.format(e))
        exit(5)
    print(TAB + '> Read configuration [green][DONE]')

    with open(input_file, 'r') as f:
        cases = json.loads(f.read())
    cases_index = {c['itemid']: i for i, c in enumerate(cases)}

    files = get_files(doc_ids, input_folder, cases_index)

    raw_corpus = []
    corpus_id = []
    print(Markdown('- **Create dictionary**'))
    with Progress(
            TAB + "> Loading in memory... [IN PROGRESS]",
            BarColumn(30),
            TimeRemainingColumn(),
            "| Document [blue]{task.fields[doc]} [white]({task.completed}/{task.total})"
            "{task.fields[error]}",
            transient=True,
            console=console) as progress:
        task = progress.add_task(
            "Loading...",
            total=len(files),
            error="",
            doc=files[0].split('/')[-1].split('_normalized.txt')[0])
        for i, p in enumerate(files):
            error = ""
            try:
                doc_id = p.split('/')[-1].split('_normalized.txt')[0]
                raw_corpus.append(load_text_file(p).split())
                corpus_id.append(doc_id)
            except Exception as e:
                error = '\n| {}'.format('Could not load the document')
                log.debug(p, e)
            progress.update(task, advance=1, error=error, doc=doc_id)
    print(TAB + "> Loading in memory... [green][DONE]")

    # Keep only the `limit_tokens` most frequent tokens
    f = Counter(t for doc in raw_corpus for t in doc)
    f = f.most_common(int(limit_tokens))
    words = [w[0] for w in f]

    print(TAB + '> Create dictionary')
    dictionary = corpora.Dictionary([words])
    dictionary.save(os.path.join(output_folder, 'dictionary.dict'))
    with open(os.path.join(output_folder, 'feature_to_id.dict'),
              'w') as outfile:
        json.dump(dictionary.token2id, outfile, indent=4, sort_keys=True)
    corpus = [dictionary.doc2bow(text) for text in raw_corpus]
    print(Markdown('- **Create language models**'))
    with Progress(
            TAB + "> Create Bag of Word... [IN PROGRESS]",
            BarColumn(30),
            TimeRemainingColumn(),
            "| Document [blue]{task.fields[doc]} [white]({task.completed}/{task.total})"
            "{task.fields[error]}",
            transient=True,
            console=console) as progress:
        task = progress.add_task("Loading...",
                                 total=len(corpus),
                                 error="",
                                 doc=corpus_id[0])
        for i, doc in enumerate(corpus):
            error = ""
            filename = os.path.join(output_folder_bow,
                                    '{}_bow.txt'.format(corpus_id[i]))
            # if update and not os.path.isfile(filename):
            with open(filename, 'w') as file:
                for f, v in doc:
                    file.write('{}:{} '.format(f, v))
            progress.update(task, advance=1, error=error, doc=corpus_id[i])
    print(TAB + "> Create Bag of Word... [green][DONE]")

    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    with Progress(
            TAB + "> Create TF-IDF... [IN PROGRESS]",
            BarColumn(30),
            TimeRemainingColumn(),
            "| Document [blue]{task.fields[doc]} [white]({task.completed}/{task.total})"
            "{task.fields[error]}",
            transient=True,
            console=console) as progress:
        task = progress.add_task("Loading...",
                                 total=len(corpus_tfidf),
                                 error="",
                                 doc=corpus_id[0])
        for i, doc in enumerate(corpus_tfidf):
            error = ""
            with open(
                    os.path.join(output_folder_tfidf,
                                 '{}_tfidf.txt'.format(corpus_id[i])),
                    'w') as file:
                for f, v in doc:
                    file.write('{}:{} '.format(f, v))
            progress.update(task, advance=1, error=error, doc=corpus_id[i])
    print(TAB + "> Create TF-IDF... [green][DONE]")
Example 4
def run(console,
        build,
        title,
        doc_ids=None,
        output_prefix='cases',
        force=False):
    __console = console
    global print
    print = __console.print

    print(Markdown("- **Step configuration**"))
    print(TAB + "> Prepare release folder structure")
    paths = ['unstructured', 'structured', 'raw']
    for p in paths:
        make_build_folder(console, os.path.join(build, p), force, strict=False)

    print(Markdown("- **Normalize database**"))
    input_folder = os.path.join(build, 'raw', 'preprocessed_documents')

    start = time.perf_counter()
    cases_files = get_files(doc_ids, input_folder)
    stop = time.perf_counter()

    print(TAB +
          "> Prepare unstructured cases in {:0.4f}s [green][DONE]".format(
              stop - start))
    # Unstructured
    start = time.perf_counter()
    with open(os.path.join(build, 'unstructured', 'cases.json'),
              'w') as outfile:
        outfile.write('[\n')
        for i, f in enumerate(cases_files):
            with open(f) as json_file:
                data = json.load(json_file)
                json.dump(data, outfile, indent=4)
                if i != len(cases_files) - 1:
                    outfile.write(',\n')
        outfile.write('\n]')
    stop = time.perf_counter()

    # Structured
    print(TAB +
          "> Generate flat cases in {:0.4f}s [green][DONE]".format(stop -
                                                                   start))
    start = time.perf_counter()
    flat_cases, representatives, extractedapp, scl, decision_body = format_structured_json(
        cases_files)
    stop = time.perf_counter()
    print(TAB + "> Flat cases size: {}MiB in {:0.4f}s".format(
        sys.getsizeof(flat_cases) / 1000, stop - start))
    schema_hints = {
        'article': {
            'col_type': COL_HINT.HOT_ONE
        },
        'documentcollectionid': {
            'col_type': COL_HINT.HOT_ONE
        },
        'applicability': {
            'col_type': COL_HINT.HOT_ONE
        },
        'paragraphs': {
            'col_type': COL_HINT.HOT_ONE
        },
        'decision_body': {
            'col_type': COL_HINT.HOT_ONE
        },
        'conclusion': {
            'col_type': COL_HINT.HOT_ONE,
            'sub_element': 'flatten'
        }
    }

    output_path = os.path.join(build, 'structured')
    with open(os.path.join(output_path, 'flat_cases.json'), 'w') as outfile:
        json.dump(flat_cases, outfile, indent=4)

    with open(os.path.join(output_path, 'schema_hint.json'), 'w') as outfile:
        json.dump(schema_hints, outfile, indent=4)

    X = flat_cases
    start = time.perf_counter()
    df, schema, flat_schema, flat_type_mapping, flat_domain_mapping = normalize(
        X, schema_hints)
    df.to_json(os.path.join(output_path, '{}.json'.format(output_prefix)),
               orient='records')
    df.to_csv(os.path.join(output_path, '{}.csv'.format(output_prefix)))

    json_files = [('schema', schema.to_schema()),
                  ('flat_schema', flat_schema.as_dict()),
                  ('flat_type_mapping', flat_type_mapping),
                  ('flat_domain_mapping', flat_domain_mapping)]
    for f in json_files:
        with open(
                os.path.join(output_path,
                             '{}_{}.json'.format(output_prefix, f[0])),
                'w') as outfile:
            json.dump(f[1], outfile, indent=4)

    os.remove(os.path.join(output_path, 'flat_cases.json'))
    os.remove(os.path.join(output_path, 'cases_flat_schema.json'))
    os.remove(os.path.join(output_path, 'cases_flat_type_mapping.json'))

    stop = time.perf_counter()

    print(TAB +
          '> Generate appnos matrix in {:0.4f}s [green][DONE]'.format(stop -
                                                                      start))
    matrice_appnos = {}
    for k, v in extractedapp.items():
        matrice_appnos[k] = {e: 1 for e in v['appnos']}
    with open(os.path.join(output_path, 'matrice_appnos.json'),
              'w') as outfile:
        json.dump(matrice_appnos, outfile, indent=4)

    print(TAB + '> Generate scl matrix [green][DONE]')
    matrice_scl = {}
    for k, v in scl.items():
        matrice_scl[k] = {e: 1 for e in v['scl']}
    with open(os.path.join(output_path, 'matrice_scl.json'), 'w') as outfile:
        json.dump(matrice_scl, outfile, indent=4)

    print(TAB + '> Generate representatives matrix [green][DONE]')
    matrice_representedby = {}
    for k, v in representatives.items():
        matrice_representedby[k] = {e: 1 for e in v['representedby']}
    with open(os.path.join(output_path, 'matrice_representatives.json'),
              'w') as outfile:
        json.dump(matrice_representedby, outfile, indent=4)

    print(TAB + '> Generate decision body matrix [green][DONE]')
    matrice_decision_body = {}
    for k, v in decision_body.items():
        matrice_decision_body[k] = dict(v['role'])
    with open(os.path.join(output_path, 'matrice_decision_body.json'),
              'w') as outfile:
        json.dump(matrice_decision_body, outfile, indent=4)

    print(TAB + '> Create archives [green][DONE]')
    # Raw
    shutil.make_archive(os.path.join(build, 'raw', 'judgments'), 'zip',
                        os.path.join(build, 'raw', 'judgments'))

    # All
    from zipfile import ZipFile
    with ZipFile(os.path.join(build, 'all.zip'), 'w') as zipObj:
        # Iterate over all the files in directory
        folders = ['unstructured', 'raw', 'structured']
        for f in folders:
            for folderName, _, filenames in os.walk(os.path.join(build, f)):
                for filename in filenames:
                    if not filename.endswith('.zip'):
                        filePath = os.path.join(folderName, filename)
                        zipObj.write(filePath)
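The normalize helper and COL_HINT are not reproduced here. As a rough illustration of what a HOT_ONE hint can mean for a list-valued column, pandas can expand such a column into indicator variables; a sketch under that assumption (data and column names are made up):

import pandas as pd

# Hypothetical flat cases with a list-valued 'article' column.
cases = [{'itemid': '001-0001', 'article': ['3', '6']},
         {'itemid': '001-0002', 'article': ['6']}]
df = pd.DataFrame(cases)

# One indicator column per article value, similar in spirit to a HOT_ONE column hint.
hot = df['article'].explode().str.get_dummies().groupby(level=0).max()
df = df.drop(columns=['article']).join(hot.add_prefix('article='))
print(df)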
Example 5
def run(console, build, title, doc_ids=None, force=False, update=False):
    __console = console
    global print
    print = __console.print

    print(Markdown("- **Step configuration**"))
    input_file = os.path.join(build, 'raw', 'cases_info',
                              'raw_cases_info_all.json')
    input_folder = os.path.join(build, 'raw', 'judgments')
    output_folder = os.path.join(build, 'raw', 'preprocessed_documents')
    print(TAB + '> Step folder: {}'.format(output_folder))
    make_build_folder(console, output_folder, force, strict=False)

    stats = {'parser_type': {'OLD': 0, 'NEW': 0}}

    with open(input_file, 'r') as f:
        cases = json.loads(f.read())
    if doc_ids:
        cases_index = {
            c['itemid']: i
            for i, c in enumerate(cases) if c['itemid'] in doc_ids
        }
    else:
        cases_index = {c['itemid']: i for i, c in enumerate(cases)}

    correctly_parsed = 0
    failed = []

    files = get_files(doc_ids, input_folder)

    decision_body_not_parsed = []
    print(Markdown('- **Preprocess documents**'))
    with Progress(
            TAB + "> Preprocess documents... [IN PROGRESS]\n",
            BarColumn(30),
            TimeRemainingColumn(),
            "| Document [blue]{task.fields[doc]} [white]({task.completed}/{task.total})"
            "{task.fields[error]}",
            transient=True,
            console=console) as progress:
        task = progress.add_task("Preprocessing...",
                                 total=len(files),
                                 error="",
                                 doc=files[0].split('/')[-1].split('.')[0])
        for _, p in enumerate(files):
            error = ""
            id_doc = p.split('/')[-1].split('.')[0]
            filename_parsed = os.path.join(output_folder,
                                           '{}_parsed.json'.format(id_doc))
            if not update or not os.path.isfile(filename_parsed):
                try:
                    p_ = update_docx(p)
                    doc = Document(p_)
                    parser = select_parser(doc)
                    stats['parser_type'][parser] += 1
                    if parser == 'NEW':
                        parsed, attachments, db_not_parsed = parse_document(
                            doc, id_doc, build)
                        decision_body_not_parsed.extend(db_not_parsed)
                        parsed.update(cases[cases_index[id_doc]])
                        with open(
                                os.path.join(
                                    output_folder,
                                    '{}_text_without_conclusion.txt'.format(
                                        id_doc)), 'w') as toutfile:
                            toutfile.write(
                                json_to_text(parsed,
                                             text_only=True,
                                             except_section=['conclusion'],
                                             attachments=attachments))
                        parsed['documents'] = ['{}.docx'.format(id_doc)]
                        parsed['content'] = {
                            '{}.docx'.format(id_doc): parsed['elements']
                        }
                        parsed['attachments'] = {
                            '{}.docx'.format(id_doc): attachments
                        }
                        del parsed['elements']
                        with open(filename_parsed, 'w') as outfile:
                            json.dump(parsed,
                                      outfile,
                                      indent=4,
                                      sort_keys=True)
                        correctly_parsed += 1
                    else:
                        raise Exception("OLD parser is not available yet.")
                except Exception as e:
                    # __console.print_exception()
                    failed.append((id_doc, e))
                    error = "\n| Could not preprocess {}".format(id_doc)
                    error += "\n| {}".format(e)
                    log.debug("{} {}".format(p, e))
            else:
                error = '\n| Skip document because it is already processed'
                correctly_parsed += 1
            progress.update(task, advance=1, error=error, doc=id_doc)
    if correctly_parsed == len(files):
        print(TAB + "> Preprocess documents... [green][DONE]")
    else:
        print(TAB + "> Preprocess documents... [yellow][WARNING]")
        print(
            TAB +
            "[bold yellow]:warning: Some documents could not be preprocessed")
        print(TAB + "  [bold yellow]THE FINAL DATABASE WILL BE INCOMPLETE!")
    print(TAB + '> Correctly parsed: {}/{} ({:.4f}%)'.format(
        correctly_parsed, len(files), (100. * correctly_parsed) / len(files)))

    if correctly_parsed != len(files):
        print(TAB + '> List of failed documents:')
        table = Table()
        table.add_column("Case ID", style="cyan", no_wrap=True)
        table.add_column("Error", justify="left", style="magenta")
        for e in failed:
            table.add_row(e[0], str(e[1]))
        print(table)

    print(TAB +
          "> Save incorrectly parsed decision body members... [green][DONE]")
    decision_body_not_parsed = pd.DataFrame(decision_body_not_parsed)
    with open(
            Path(build) / get_log_folder() / f'{title}_decision_body.html',
            'w') as f:
        decision_body_not_parsed.to_html(f)
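select_parser and parse_document are project helpers that are not shown here. The document access itself relies on python-docx; a minimal sketch of reading paragraphs and their styles from a judgment file (the file name is illustrative):

from docx import Document

doc = Document('001-57574.docx')  # hypothetical judgment file
for paragraph in doc.paragraphs:
    # Style names (e.g. headings) are typically what a parser uses to detect
    # section boundaries such as PROCEDURE, THE LAW or the conclusion.
    print(paragraph.style.name, '|', paragraph.text[:80])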
Example 6
def run(console, build, title, doc_ids=None, force=False):
    __console = console
    global print
    print = __console.print

    print(Markdown("- **Step configuration**"))
    input_folder = os.path.join(build, 'raw', 'raw_cases_info')
    output_folder = path.join(build, 'raw', 'cases_info')
    print(TAB + '> Step folder: {}'.format(output_folder))
    make_build_folder(console, output_folder, force, strict=False)

    cases = []
    files = [
        path.join(input_folder, f) for f in listdir(input_folder)
        if path.isfile(path.join(input_folder, f)) and '.json' in f
    ]
    for p in files:
        try:
            with open(p, 'r') as f:
                content = f.read()
                index = json.loads(content)
                cases.extend(index["results"])
        except Exception as e:
            log.info(p, e)
    cases = [c["columns"] for c in cases]

    print(Markdown("- **Filter cases**"))
    cases = filter_cases(cases)
    print(Markdown("- **Format cases metadata**"))
    cases = format_cases(console, cases)

    print(Markdown("- **Generate statistics**"))
    stats = generate_statistics(cases)

    with open(path.join(output_folder, 'filter.statistics.json'),
              'w') as outfile:
        json.dump(stats, outfile, indent=4, sort_keys=True)

    with open(path.join(output_folder, 'raw_cases_info_all.json'),
              'w') as outfile:
        json.dump(cases, outfile, indent=4, sort_keys=True)

    filtered_cases = []
    for c in cases:
        classes = []
        for e in c['conclusion']:
            if e['type'] in ['violation', 'no-violation']:
                if 'article' in e:
                    g = e['article']
                    classes.append('{}:{}'.format(
                        g, 1 if e['type'] == 'violation' else 0))

        classes = list(set(classes))
        opposed_classes = any([
            e for e in classes if e.split(':')[0] + ':' +
            str(abs(1 - int(e.split(':')[-1]))) in classes
        ])
        if len(classes) > 0 and not opposed_classes:
            filtered_cases.append(c)

    outcomes = {}
    cases_per_articles = {}
    for c in filtered_cases:
        ccl = c['conclusion']
        for e in ccl:
            if e['type'] in ['violation', 'no-violation']:
                if 'article' in e:
                    if e['article'] not in outcomes:
                        outcomes[e['article']] = {
                            'violation': 0,
                            'no-violation': 0,
                            'total': 0
                        }
                    outcomes[e['article']][e['type']] += 1
                    outcomes[e['article']]['total'] += 1
                    if e['article'] not in cases_per_articles:
                        cases_per_articles[e['article']] = []
                    cases_per_articles[e['article']].append(c)

    print(Markdown("- **Generate case listing for datasets**"))
    multilabel_cases = []
    multilabel_index = set()
    with Progress(TAB +
                  "> Generate case info for specific article [IN PROGRESS]",
                  "| {task.fields[progress_array]}",
                  transient=True,
                  console=console) as progress:
        progress_array = []

        def to_str(a):
            if len(a) == 1:
                return '[[green]{}[white]]'.format(a[0])
            return '[{}{}]'.format(
                ''.join(['[green]{}[white], '.format(e) for e in a[:-1]]),
                a[-1])

        task = progress.add_task("Generate datasets cases",
                                 total=len(outcomes),
                                 progress_array="[]")
        for k in outcomes.keys():
            progress_array.append(k)
            with open(
                    path.join(output_folder,
                              'raw_cases_info_article_{}.json'.format(k)),
                    'w') as outfile:
                json.dump(cases_per_articles[k],
                          outfile,
                          indent=4,
                          sort_keys=True)
            multilabel_cases.extend(cases_per_articles[k])
            for c in cases_per_articles[k]:
                multilabel_index.add(c['itemid'])
            progress.update(task,
                            advance=1,
                            progress_array=to_str(progress_array))
    print(TAB + "> Generate case info for specific article [green][DONE]", )
    multilabel_cases_unique = []
    for c in multilabel_cases:
        if c['itemid'] in multilabel_index:
            multilabel_cases_unique.append(c)
            multilabel_index.discard(c['itemid'])

    with open(path.join(output_folder, 'raw_cases_info_multilabel.json'),
              'w') as outfile:
        json.dump(multilabel_cases_unique, outfile, indent=4, sort_keys=True)
    print(TAB + "> Generate case info for multilabel dataset [green][DONE]", )
    # Key: case id / Value: article whose dataset the case is assigned to
    multiclass_index = {}
    multiclass_cases = []
    sorted_outcomes = dict(
        sorted(outcomes.items(), key=lambda x: x[1]['total'])).keys()
    for k in sorted_outcomes:
        for c in cases_per_articles[k]:
            if c['itemid'] not in multiclass_index:
                nb_datasets = [
                    e['article'] for e in c['conclusion'] if 'article' in e
                ]
                if len(list(set(nb_datasets))) == 1:
                    for cc in c['conclusion']:
                        if 'article' in cc and cc['article'] == k:
                            c['mc_conclusion'] = [cc]
                            break
                    if 'mc_conclusion' in c:
                        multiclass_index[c['itemid']] = k
                        multiclass_cases.append(c)
                    else:
                        log.info('No article found for {}'.format(c['itemid']))
                else:
                    log.info(
                        'Case {} appears in {} datasets: {}. Skipped for multiclass.'.
                        format(c['itemid'], len(set(nb_datasets)),
                               ','.join(list(set(nb_datasets)))))

    with open(path.join(output_folder, 'raw_cases_info_multiclass.json'),
              'w') as outfile:
        json.dump(multiclass_cases, outfile, indent=4, sort_keys=True)
    print(TAB + "> Generate case info for multiclass [green][DONE]", )
Example 7
def run(console, build, title, doc_ids=None, force=False, update=False):
    __console = console
    global print
    print = __console.print

    print(Markdown("- **Step configuration**"))
    input_folder = os.path.join(build, 'raw', 'preprocessed_documents')
    output_folder = os.path.join(build, 'raw', 'normalized_documents')
    ngrams_config = {}
    try:
        ngrams_config = config()['steps']['normalize']['ngrams']
    except Exception as e:
        print('Cannot retrieve n-grams configuration. Details: {}'.format(e))
        exit(5)

    print(TAB + '> Step folder: {}'.format(output_folder))
    make_build_folder(console, output_folder, force, strict=False)

    files = get_files(doc_ids, input_folder)

    raw_corpus = []
    corpus_id = []
    print(Markdown('- **Load documents**'))
    with Progress(
            TAB + "> Loading in memory... [IN PROGRESS]",
            BarColumn(30),
            TimeRemainingColumn(),
            "| Document [blue]{task.fields[doc]} [white]({task.completed}/{task.total})"
            "{task.fields[error]}",
            transient=True,
            console=console
    ) as progress:
        task = progress.add_task("Loading...", total=len(files), error="",
                                 doc=files[0].split('/')[-1].split('_text_without_conclusion.txt')[0])
        for i, p in enumerate(files):
            error = ""
            doc_id = p.split('/')[-1].split('_text_without_conclusion.txt')[0]
            try:
                raw_corpus.append(load_text_file(p))
                corpus_id.append(doc_id)
            except Exception as e:
                error = '\n| {}'.format('Could not load the document')
                log.debug(p, e)
            progress.update(task, advance=1, error=error, doc=doc_id)
    print(TAB + "> Loading in memory... [green][DONE]")

    normalized_tokens = []
    print(Markdown('- **Generate language model**'))
    try:
        with Progress(
                TAB + "> Normalize... [IN PROGRESS]\n",
                BarColumn(30),
                TimeRemainingColumn(),
                "| Document [blue]{task.fields[doc]} [white]({task.completed}/{task.total})"
                "{task.fields[error]}",
                transient=True,
                console=console
        ) as progress:
            task = progress.add_task("Compute tokens...", total=len(raw_corpus), error="", doc=corpus_id[0])
            for i, doc in enumerate(raw_corpus):
                error = ""
                filename = os.path.join(output_folder, '{}_normalized.txt'.format(corpus_id[i]))
                if not update or not os.path.isfile(filename):
                    normalized_tokens.append(normalized_step(doc, force=force, lemmatization=True))
                else:
                    # Keep one token list per document so indices stay aligned with corpus_id.
                    with open(filename, 'r') as f:
                        normalized_tokens.append(f.read().split())
                progress.update(task, advance=1, error=error, doc=corpus_id[i])
    except Exception as e:
        print(TAB + '[bold red]:double_exclamation_mark: Could not normalize the tokens. Details: {}'.format(e))
        exit(40)
    print(TAB + "> Normalize... [green][DONE]")

    all_grams = []
    doc_grammed = []
    try:
        with Progress(
                TAB + "> Compute ngrams... [IN PROGRESS]\n",
                BarColumn(30),
                TimeRemainingColumn(),
                "| Document [blue]{task.fields[doc]} [white]({task.completed}/{task.total})"
                "{task.fields[error]}",
                transient=True,
                console=console
        ) as progress:
            task = progress.add_task("Compute tokens...", total=len(corpus_id), error="", doc=corpus_id[0])
            for i, doc in enumerate(normalized_tokens):
                error = ""
                filename = os.path.join(output_folder, '{}_normalized.txt'.format(corpus_id[i]))
                if not update or not os.path.isfile(filename):
                    grams = ngram_step(doc, ngrams_config, force=force)
                    merged = []
                    for g in grams.values():
                        merged.extend(g)
                    doc_grammed.append(merged)
                    all_grams.extend(merged)
                else:
                    error = "\n| Load document as already normalized."
                    with open(filename, 'r') as f:
                        all_grams.extend(f.read().split())
                        doc_grammed.append(None)
                progress.update(task, advance=1, error=error, doc=corpus_id[i])
    except Exception:
        console.print_exception()
    print(TAB + "> Compute ngrams... [green][DONE]")

    f = Counter(all_grams)
    with open(os.path.join(output_folder, 'full_dictionary.txt'), 'w') as outfile:
        json.dump(f, outfile, indent=4, sort_keys=True)
    print(TAB + '> Save the full dictionary [green][DONE]')

    with Progress(
            TAB + "> Save normalized documents... [IN PROGRESS]\n",
            BarColumn(30),
            TimeRemainingColumn(),
            "| Document [blue]{task.fields[doc]} [white]({task.completed}/{task.total})"
            "{task.fields[error]}",
            transient=True,
            console=console
    ) as progress:
        task = progress.add_task("Compute tokens...", total=len(doc_grammed), error="", doc=corpus_id[0])
        for i, doc in enumerate(doc_grammed):
            if doc is not None:
                with open(os.path.join(output_folder, '{}_normalized.txt'.format(corpus_id[i])), 'a') as file:
                    file.write(' '.join(doc))
            progress.update(task, advance=1, error=error, doc=corpus_id[i])
    print(TAB + '> Save normalized documents... [green][DONE]')
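ngram_step is a project helper driven by the n-grams configuration and is not reproduced here. As a rough illustration, n-grams over a normalized token list can be built with a sliding window; a sketch assuming a configuration of the form {'n': [1, 2]}:

def simple_ngrams(tokens, config):
    # Map each n to the list of n-grams, joined with '_', loosely mirroring
    # what an ngram_step-like helper could produce.
    grams = {}
    for n in config.get('n', [1]):
        grams[n] = ['_'.join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
    return grams


tokens = 'the court finds a violation of article 6'.split()
print(simple_ngrams(tokens, {'n': [1, 2]}))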
Example 8
def run(console,
        build,
        title,
        doc_ids=None,
        articles=[],
        processed_folder='all',
        force=True):
    __console = console
    global print
    print = __console.print

    suffix = '_{}'.format(processed_folder)
    input_file = os.path.join(build, 'raw', 'cases_info',
                              'raw_cases_info{}.json'.format(suffix))
    input_folder = os.path.join(build, 'structured')
    output_folder = os.path.join(build, 'datasets')
    input_folder_bow = os.path.join(input_folder, 'bow')

    print(Markdown("- **Step configuration**"))
    print(TAB + '> Step folder: {}'.format(output_folder))
    make_build_folder(console, output_folder, force, strict=False)

    # Get the list of cases s.t. we have a BoW and TF-IDF representation

    files = get_files(doc_ids, input_folder_bow, input_folder)

    id_list = [f.split('/')[-1].split('_')[0] for f in files]

    # Read the case info
    cases = []
    try:
        with open(input_file, 'r') as f:
            content = f.read()
            cases = json.loads(content)
    except Exception as e:
        print(e)
        exit(1)

    # Filter the cases info to keep only the items in id_list
    cases = [c for c in cases if c['itemid'] in id_list]
    conclusion_key = 'conclusion' if processed_folder != 'multiclass' else 'mc_conclusion'
    cases = [c for c in cases if conclusion_key in c]

    keys = [
        "itemid", "respondent", "rank", "applicability", "decisiondate",
        "doctypebranch", "importance", "introductiondate", "judgementdate",
        "originatingbody_type", "originatingbody_name",
        "respondentOrderEng", "separateopinion", "typedescription"
    ]

    keys_list = [
        "article", "documentcollectionid", "externalsources", "extractedappno",
        "kpthesaurus", "parties", "scl", "representedby"
    ]

    feature_index = {k: i for i, k in enumerate(keys + keys_list)}
    feature_to_value = dict(
        zip(keys + keys_list, [None] * (len(keys) + len(keys_list))))
    for c in cases:
        for k, v in c.items():
            if k in keys:
                if feature_to_value[k] is None:
                    feature_to_value[k] = set()
                feature_to_value[k].add(v)
            if k in keys_list:
                if feature_to_value[k] is None:
                    feature_to_value[k] = set()
                feature_to_value[k].update(v)

    feature_to_encoded = {}
    count = 0
    for k, s in feature_to_value.items():
        for v in s:
            if k in keys:
                feature_to_encoded[u'{}={}'.format(k, v)] = count
            elif k in keys_list:
                feature_to_encoded[u'{}_has_{}'.format(k, v)] = count
            count += 1

    # Encode conclusions
    outcomes = {}
    for i, c in enumerate(cases):
        ccl = c[conclusion_key]
        for e in ccl:
            if e['type'] in ['violation', 'no-violation']:
                if e['base_article'] not in outcomes:
                    outcomes[e['base_article']] = {
                        'violation': 0,
                        'no-violation': 0,
                        'total': 0
                    }
                # if e['article'] == '8' and e['type'] == 'no-violation':
                #    print(c['docname'])
                outcomes[e['base_article']][e['type']] += 1
                outcomes[e['base_article']]['total'] += 1
    # Determine output
    encoded_outcomes = {}
    count = 1
    for i, _ in outcomes.items():
        encoded_outcomes[i] = count
        count += 1

    offset = len(feature_to_encoded)

    print(Markdown('- **Generate dataset**'))
    generate_dataset(cases=cases,
                     keys=keys,
                     keys_list=keys_list,
                     encoded_outcomes=encoded_outcomes,
                     feature_index=feature_index,
                     feature_to_encoded=feature_to_encoded,
                     output_path=output_folder,
                     name=processed_folder,
                     offset=offset,
                     processed_folder=input_folder,
                     filter_classes=None if articles == [] else articles,
                     force=force)

    shutil.make_archive(output_folder, 'zip', output_folder)
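generate_dataset is not reproduced here. The encoding prepared above maps every categorical 'key=value' or 'key_has_value' feature to an integer id, which can then be combined with the BoW/TF-IDF token ids through the offset. A toy illustration of that mapping (data and keys are made up):

cases = [{'itemid': '001-0001', 'importance': '1', 'article': ['3', '6']},
         {'itemid': '001-0002', 'importance': '2', 'article': ['6']}]
keys = ['importance']    # single-valued descriptive features
keys_list = ['article']  # list-valued descriptive features

feature_to_value = {k: set() for k in keys + keys_list}
for c in cases:
    for k in keys:
        feature_to_value[k].add(c[k])
    for k in keys_list:
        feature_to_value[k].update(c[k])

feature_to_encoded = {}
for k, values in feature_to_value.items():
    for v in sorted(values):
        template = '{}={}' if k in keys else '{}_has_{}'
        feature_to_encoded[template.format(k, v)] = len(feature_to_encoded)

print(feature_to_encoded)
# e.g. {'importance=1': 0, 'importance=2': 1, 'article_has_3': 2, 'article_has_6': 3}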