Code example #1
def download_edictos(
    data_dir=f"{os.environ['HOME']}/data/corteconstitucional/edictos", ):
    """
    needs to be run several times, some times it claims that it cannot find downloaded pdfs,
    :param data_dir:
    :return:
    """
    url = "https://www.corteconstitucional.gov.co/secretaria/edictos/"
    download_dir = f"{data_dir}/downloads"
    os.makedirs(download_dir, exist_ok=True)

    wd = build_chrome_driver(download_dir, headless=True)
    hrefs = get_hrefs(url, wd)

    old_file = f"{data_dir}/documents.jsonl"
    found_existing_documents = os.path.isfile(old_file)
    if found_existing_documents:
        new_file = old_file.split(".jsonl")[0] + "_updated.jsonl"
        old_docs = list(data_io.read_jsonl(old_file))
    else:
        old_docs = []
        new_file = old_file
    try:
        data_io.write_jsonl(
            new_file, generate_raw_docs(old_docs, hrefs, wd, download_dir))
    except Exception:
        traceback.print_exc()
        print("writing documents failed; see traceback above")
    finally:
        if found_existing_documents:
            shutil.move(new_file, old_file)
Code example #2
def parse_edictos(save=True) -> List:
    data = list(tqdm(generate_edictos()))
    # data = [Edicto(**d) for d in data_io.read_jsonl("edictos.jsonl")]
    unique_data = list(set(data))
    print(len(data))
    print(f"unique: {len(unique_data)}")
    if save:
        # note: writes all parsed edictos (duplicates included), while only the unique ones are returned
        data_io.write_jsonl("edictos.jsonl", (asdict(d) for d in data))
    return unique_data
Code example #3
def read_or_process_data(path, limit: int = None):
    tmp_train_data_file = "/tmp/train_data.jsonl.gz"
    tmp_meta_file = "/tmp/meta_data.jsonl"
    if not os.path.exists(tmp_train_data_file):

        train_df, features = read_csvs_build_features(path)
        types = {
            name: typ
            for name, typ in train_df.dtypes.to_dict().items()
            if name in features
        }
        numerical_features = [
            name for name, typ in types.items() if typ == float or typ == int
        ]
        # print(numerical_features)
        categorical_features = [
            name for name, typ in types.items() if typ == str
        ]
        # print(categorical_features)
        input_dim = len(numerical_features) + len(categorical_features)

        def dataframe_to_dicts(df):
            data = [row[1].to_dict() for row in df.iterrows()]
            for d in data:
                del d["Date"]
            return data

        train_data_dicts = dataframe_to_dicts(train_df)
        y_train_list = train_df["target"].tolist()
        for d, t in zip(train_data_dicts, y_train_list):
            d["target"] = t
        data_io.write_jsonl(tmp_train_data_file, train_data_dicts)
        data_io.write_jsonl(
            tmp_meta_file,
            [{
                "numerical_features": numerical_features,
                "categorical_features": categorical_features,
            }],
        )
    else:
        print("loading already processed data")
        train_data_dicts = list(
            data_io.read_jsonl(tmp_train_data_file, limit=limit))
        y_train_list = [d["target"] for d in train_data_dicts]
        meta = list(data_io.read_jsonl(tmp_meta_file))[0]
        numerical_features = meta["numerical_features"]
        categorical_features = meta["categorical_features"]
    # features = categorical_features + numerical_features
    return train_data_dicts, categorical_features, numerical_features
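
A minimal usage sketch for the caching loader above; the CSV directory path is hypothetical, and pulling the targets back out mirrors the function's own else-branch:

# hypothetical invocation; `path` must point at the CSV directory expected by read_csvs_build_features
train_data_dicts, categorical_features, numerical_features = read_or_process_data(
    "data/train_csvs", limit=10_000)
y_train = [d["target"] for d in train_data_dicts]  # targets were written back into the dicts
print(len(train_data_dicts), len(categorical_features), len(numerical_features))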
Code example #4
def populate_es_parallel_bulk(
    es, files, es_index_name, es_type, limit=None, num_processes=4, chunk_size=500
):
    dicts_g = (d for file in files for d in read_jsonl(file, limit=limit))

    actions_g = (build_es_action(d, es_index_name, es_type) for d in dicts_g)
    results_g = helpers.parallel_bulk(
        es,
        actions_g,
        thread_count=num_processes,
        queue_size=num_processes,
        chunk_size=chunk_size,
        raise_on_exception=False,
        raise_on_error=False,
    )
    failed_g = (
        pop_exception(d)
        for ok, d in tqdm(results_g)
        if not ok and d.get("create", {}).get("status", 200) != 409
    )
    data_io.write_jsonl("failed.jsonl", failed_g)
Code example #5
def extract_from_edicto(source, string, edicto_num: int, edicto_year):
    edicto_date = parse_edicto_date(string)
    if edicto_date is None:
        data_io.write_jsonl(DEBUG_EDICTO_DATE, [{
            "source": source,
            "text": string
        }],
                            mode="ab")
        return []
    spans = [get_sentencia_span(m) for m in sentencia_pattern.finditer(string)]
    edictos = []
    for k, (start, end, sentencia) in enumerate(spans):
        next_start, _, _ = (spans[k + 1] if k + 1 < len(spans) else
                            (len(string), None, None))
        _, previous_end, _ = spans[k - 1] if k > 0 else (None, 0, None)
        behind_sentencia = string[end:next_start]
        expedientes = extract_expedientes(behind_sentencia)
        if len(expedientes) > 0:
            before_sentencia = string[previous_end:start]
            data_io.write_lines(DEBUG_BEFORE_SENTENCIA,
                                [before_sentencia.replace("\n", "€")],
                                mode="ab")
            date = extract_date(before_sentencia)
            if date is not None:
                edictos.append(
                    Edicto(
                        sentencia,
                        date,
                        edicto_date,
                        edicto_year,
                        expedientes,
                        source,
                        edicto_num,
                    ))
    if len(edictos) != 1:
        data_io.write_jsonl(DEBUG_NO_EDICTO, [{
            "source": source,
            "string": string
        }], "ab")
    return edictos
Code example #6
def prepare_manifest(corpora_dir="/content/corpora", limit=None):

    manifest = "manifest.jsonl"
    manifests = list(Path(corpora_dir).rglob("manifest.jsonl.gz"))
    limit = round(limit / len(manifests)) if limit is not None else None

    def get_file_name(f):
        return f.split("/")[-1] if "/" in f else f

    g = (
        {
            "audio_filepath":
            f"{str(f).replace(f.name, '')}/mp3/{get_file_name(d['audio_file'])}",  #TODO(tilo): just hack for TEDLIUM!
            "duration": d["duration"],
            "text": d["text"],
        } for f in manifests for d in data_io.read_jsonl(str(f), limit=limit))
    data_io.write_jsonl(manifest, g)
    return manifest
Code example #7
        def consumer(file):
            print("%s is doing %s; limit: %d" %
                  (multiprocessing.current_process(), file, limit))

            dicts_g = (d for d in data_io.read_jsonl(file, limit=limit))

            actions_g = (build_es_action(d,
                                         es_index_name,
                                         es_type,
                                         op_type="index") for d in dicts_g)
            results_g = helpers.streaming_bulk(
                es_client,
                actions_g,
                chunk_size=chunk_size,
                yield_ok=True,
                raise_on_error=False,
                raise_on_exception=False,
            )

            failed_g = (pop_exception(d) for ok, d in results_g if not ok)
            data_io.write_jsonl(
                "%s_failed.jsonl" % multiprocessing.current_process(),
                failed_g)
Code example #8
def populate_es_streaming_bulk(
    es_client: Elasticsearch,
    dicts: Iterable[Dict],
    es_index_name: str,
    es_type: str,
    chunk_size: int = 500,
):
    def pop_exception(d):
        d["index"].pop("exception")
        return d

    es_actions_g = (build_es_action(d,
                                    index_name=es_index_name,
                                    es_type=es_type) for d in dicts)
    results_g = helpers.streaming_bulk(
        es_client,
        es_actions_g,
        chunk_size=chunk_size,
        yield_ok=True,
        raise_on_error=True,
    )
    failed_g = (pop_exception(d) for ok, d in tqdm(results_g) if not ok)
    data_io.write_jsonl("failed.jsonl", failed_g)
Code example #9
def scrape_date_range(date_from: date, date_to: date, wd: WebDriver,
                      data_path):
    year_from = date_from.year
    month_from = date_from.month
    day_from = date_from.day

    year_to = date_to.year
    month_to = date_to.month
    day_to = date_to.day

    date_range = f"{year_from}-{month_from}-{day_from}_{year_to}-{month_to}-{day_to}"
    for page in itertools.count(start=0):
        bucket_path = f"{data_path}/{date_range}_{page}"
        url = f"{base_url}/?de=0&se=0&ac=0&ca=0&rs={year_from}%2F{fmt(month_from)}%2F{fmt(day_from)}&re={year_to}%2F{fmt(month_to)}%2F{fmt(day_to)}&st=0&pg={page}"
        bucket_downloads = f"{bucket_path}/downloads"
        os.makedirs(bucket_downloads, exist_ok=True)

        docs_file = f"{bucket_path}/docs.jsonl.gz"
        if os.path.isfile(docs_file):
            continue

        wd.get(url)
        source = wd.page_source
        if "No hay resultados para mostrar en este momento" in source:
            break

        docs = []
        for hit in get_hits(source):
            docs.append(build_doc(wd, hit))
            yield True

        data_io.write_jsonl(docs_file, docs)

        for file_name in os.listdir(download_path):
            shutil.move(os.path.join(download_path, file_name),
                        bucket_downloads)
Code example #10
                      "lte": "%s"
                    }
                  }
                }
              ]
            }
          },
          "size": 20,
          "_source": {
            "excludes": "content"
          }
        }
        """ % (d["file_number"], d["type"], d['date'], d['date'])


if __name__ == "__main__":
    file = home + "/data/cases.json.gz"

    es_client = build_es_client(host="guntherhamachi")
    TYPE = "decision"
    INDEX = "juris"

    def not_found_generator():
        for d in tqdm(data_io.read_jsonl(file)):
            body = build_body(d)
            r = es_client.search(index=INDEX, body=body, size=3)
            if r['hits']['total']['value'] < 1:
                yield d

    data_io.write_jsonl('failed_to_find.jsonl', not_found_generator())
Code example #11
def process(d):
    # d = {k:d[k] for k in ['date','aktenzeichen','zitiervorschlag','entscheidungsdatum','content']}
    return d


if __name__ == "__main__":
    es_client = build_es_client()
    TYPE = "decision"
    INDEX = "juris"
    # fmt: off
    fields = [
        "Orientierungssatz", "Gründe", "Tenor", "Leitsatz",
        "Sonstiger Orientierungssatz", "Abweichende Meinung",
        "Entscheidungsgründe", "Tatbestand", "Sonstiger Kurztext",
        "Sonstiger Langtext"
    ]
    # fmt: on

    body = {"query": {"multi_match": {"query": "185 StGB", "fields": fields}}}
    hits_g = elastic_scan(
        es_client,
        index=INDEX,
        query=body,
        batch_size=100,
    )
    docs = (process(d["_source"]) for d in tqdm(hits_g))

    # docs = (process(d) for d in data_io.read_jsonl('BverfG_juris.jsonl.gz'))
    data_io.write_jsonl("decisions_185_StGB.jsonl.gz", docs)
Code example #12
        for f in FIELDS
    }
    schema = Schema(
        aktenzeichen=ID(stored=True),
        **fields,
    )
    file = "BverfG.jsonl.gz"
    data = ({
        process_field_name(k): v
        for k, v in d.items() if k in FIELDS + ["aktenzeichen"]
    } for d in data_io.read_jsonl(file))
    return schema, data


if __name__ == "__main__":

    INDEX_DIR = "bverfg_index"
    if not os.path.isdir(INDEX_DIR):
        schema, data = build_schema_and_corpus()
        build_index(data, schema, index_dir=INDEX_DIR)
        print("done building corpus")

    ix = index.open_dir(INDEX_DIR)

    with ix.searcher() as searcher:
        qp = MultifieldParser(FIELDS, schema=ix.schema)
        q = qp.parse("185 StGB")
        results = searcher.search(q, limit=None)

        data_io.write_jsonl("result.jsonl", (r.fields() for r in results))
Code example #13
    """
    # First line is the title
    split = re.split(r'\|', content[0].rstrip(), maxsplit=2)
    doc_id = int(split[0])

    stable_id = get_stable_id(doc_id)

    doc_text = split[2]

    # Second line is the abstract
    # Assume these are newline-separated; is this true?
    # Note: some articles do not have abstracts, however they still have this line
    doc_text += ' ' + re.split(r'\|', content[1].rstrip(), maxsplit=2)[2]

    annos = parse_annotations(content)

    return {'PMID':doc_id,
            'stable_id':stable_id,
            'text':doc_text,
            'annos':annos}


if __name__ == '__main__':

    # file_path = os.environ["HOME"]+'/code/NLP/IE/pubtator/download/bioconcepts2pubtatorcentral.offset.sample'
    file_path = os.environ["HOME"]+'/pubtator/download/bioconcepts2pubtatorcentral.offset.gz'
    g = (pubtator_parser(content) for content in doc_generator(file_path,limit=100))
    data_io.write_jsonl('./parsed.jsonl',g)

Code example #14
    tokenize = lambda s: s.split(" ")
    order = 3

    start = time()
    aligned_ngrams = (db.from_sequence(
        refs_hyps, npartitions=4 * 4).map(lambda rh: calc_aligned_ngram_tuples(
            tokenize(rh[0]), tokenize(rh[1]), order)).flatten().map(
                lambda rh: (" ".join(rh[0]), " ".join(rh[1]))))

    def error_rate(ref, hyp_counts: Counter):
        overall_num_errors = sum(v for k, v in hyp_counts.items() if ref != k)
        num_correct = hyp_counts[ref]
        return overall_num_errors / (1 + num_correct)

    result = aligned_ngrams.foldby(
        lambda rh: rh[0],
        lambda total, x: total + Counter([x[1]]),
        initial=Counter(),
        combine=lambda x, y: x + y,
    ).topk(1000, lambda kc: error_rate(*kc))
    counts = result.map(lambda kc:
                        (kc[0], dict(kc[1].most_common(5)))).compute()
    data_io.write_jsonl("ngram_counts.jsonl", counts)

    # pprint(result.compute()) #topk(10,key=lambda )
    # aligned_ngrams.filter(lambda rh: rh[0] != rh[1]).map(json.dumps).to_textfiles(
    #     "processed/erroneous_ngrams_*.jsonl.gz"
    # )
    print(f"took: {time()-start} seconds")
Code example #15
def multi_eval(algos, LOGS_DIR, num_eval=5, num_workers=12):
    """
    evaluating 12 jobs with 1 workers took: 415.78 seconds
    evaluating 12 jobs with 3 workers took: 154.78 seconds
    evaluating 12 jobs with 6 workers took: 91.88 seconds
    evaluating 12 jobs with 12 workers took: 70.68 seconds

    on gunther one gets a CUDA out-of-memory error with num_workers > 12
    """

    task = PlatoScoreTask(LOGS_DIR=LOGS_DIR)


    jobs = [
        Experiment(
            job_id=get_id(),
            name=build_name(algo, error_sim, two_slots),
            config=build_config(algo, error_sim=error_sim, two_slots=two_slots),
            train_dialogues=td,
            eval_dialogues=1000,
            num_warmup_dialogues=warmupd
        )
        for _ in range(num_eval)
        for error_sim in [False,True]
        for two_slots in [False,True]
        for td in [40000]
        for warmupd in [4000]
        for algo in algos
    ]
    start = time()

    outfile = LOGS_DIR+"/results.jsonl"

    mode = "wb"
    if os.path.isdir(LOGS_DIR):
        results = list(data_io.read_jsonl(outfile))
        done_ids = [e['job_id'] for e in results]
        jobs = [e for e in jobs if e.job_id not in done_ids]
        print('only got %d jobs to do'%len(jobs))
        print([e.job_id for e in jobs])
        mode = "ab"
    else:
        os.makedirs(LOGS_DIR)

    if num_workers > 0:
        num_workers = min(len(jobs),num_workers)
        with WorkerPool(processes=num_workers, task=task, daemons=False) as p:
            processed_jobs = p.process_unordered(jobs)
            data_io.write_jsonl(outfile, processed_jobs, mode=mode)
    else:
        with task as t:
            processed_jobs = [t(job) for job in jobs]
            data_io.write_jsonl(outfile, processed_jobs, mode=mode)

    scoring_runs = list(data_io.read_jsonl(outfile))
    plot_results(scoring_runs,LOGS_DIR)

    print(
        "evaluating %d jobs with %d workers took: %0.2f seconds"
        % (len(jobs), num_workers, time() - start)
    )
Code example #16
                "train": train,
                "test": test
            }.items()
        }
        data = [[token for token, tag in datum] for datum in corpus]
        idx = select_fun(tagger, data)
        return predictions, idx


if __name__ == "__main__":
    import os

    data_supplier = partial(read_conll03_en,
                            path=os.environ["HOME"] + "/data/IE/seqtag_data")
    dataset = data_supplier()

    task = ActiveLearnSpacyCrfSeqTagScoreTask(params=Params(c1=0.5,
                                                            c2=0.0,
                                                            max_it=100),
                                              data_supplier=data_supplier)
    num_folds = 5
    select_funs = [select_by_max_entropy, select_random]
    jobs = [Job(f) for _ in range(num_folds) for f in select_funs]
    num_workers = min(multiprocessing.cpu_count() - 1, len(jobs))
    start = time()
    scores = calc_scores(task, jobs, num_workers)
    duration = time() - start
    print("%d jobs with %d workers took: %0.2f seconds" %
          (len(jobs), num_workers, duration))
    data_io.write_jsonl("scores.jsonl", scores)
Code example #17
from util import data_io
from util.util_methods import merge_dicts

from corteconstitucional.parse_edictos import Edicto
from corteconstitucional.parse_proceso_tables import parse_table


def merge_edictos_proceso_tables(
    edictos: List,
    data_path=f"{os.environ['HOME']}/data/corteconstitucional/procesos_tables"
) -> List:
    raw_data = list(
        data_io.read_json(str(file))
        for file in tqdm(Path(data_path).glob("*.json")))
    print("parse tables")
    table_data = (parse_table(d) for d in raw_data)
    exp2table = {t.expediente: t for t in tqdm(table_data)}
    g = (merge_dicts([
        asdict(e), {
            "tables": [asdict(exp2table[exp]) for exp in e.expedientes]
        }
    ]) for e in edictos)
    merged_data = list(g)
    return merged_data


if __name__ == "__main__":
    edictos = [Edicto(**d) for d in data_io.read_jsonl("edictos.jsonl")]
    merged_data = merge_edictos_proceso_tables(edictos)
    data_io.write_jsonl("/tmp/merged_edictos2tables.jsonl", merged_data)
Code example #18
from password_generator import PasswordGenerator
from util import data_io

if __name__ == '__main__':
    pwo = PasswordGenerator()
    pwo.minlen = 9
    pwo.maxlen = 9

    def build_user(eid, user_name):
        return {'name': user_name, 'password': pwo.generate(), 'id': eid}

    data_io.write_jsonl('annotators.jsonl',
                        (build_user(k + 2, user_name) for k, user_name in
                         enumerate(['Salar', 'Vinicius', 'Tarcisio', 'Tilo'])))
Code example #19
        if limit is not None and counter > limit:
            break
        yield hit

def process(d):
    d = {k:d[k] for k in ['date','aktenzeichen','zitiervorschlag','entscheidungsdatum','content']}
    return d

if __name__ == '__main__':
    es_client = build_es_client(host="gunther")
    TYPE = "decision"
    INDEX = "juris"
    body = {
      "query": {
        "match_phrase": {
          "zitiervorschlag": "BVerfG"
        }
      }
    }
    hits_g = elastic_scan(
        es_client,
        index=INDEX,
        query=body,
        batch_size=100,
        # limit=100,
    )
    docs = (process(d['_source']) for d in tqdm(hits_g))

    # docs = (process(d) for d in data_io.read_jsonl('BverfG_juris.jsonl.gz'))
    data_io.write_jsonl('BverfG_juris_content.jsonl.gz',docs)
Code example #20
                "holiday_type": holiday_type,
            }


def build_date(th, year):
    day_s, month = th[0].text.split(" ")
    day = int(day_s)
    date_s = f"{month} {day} {year}"
    date = datetime.strptime(date_s, "%b %d %Y")
    date_formatted = date.strftime("%m/%d/%Y")
    return date_formatted


def get_holidays(wd, year):
    url = f"https://www.timeanddate.com/holidays/colombia/{year}?hol=1"
    wd.get(url)
    soup = BeautifulSoup(wd.page_source, features="html.parser")
    table = soup.find("section", class_="table-data__table")
    return list(generate(table, year))


if __name__ == "__main__":
    wd = build_chrome_driver("/tmp/", headless=True)
    first_year = 2015
    last_year = 2020
    data_io.write_jsonl(
        "holidays.jsonl",
        (d for year in tqdm(range(first_year, last_year + 1)) for d in get_holidays(wd, year)),
    )
    wd.close()
Code example #21
    count_rows,
)
from tqdm import tqdm
from util import data_io, util_methods

if __name__ == "__main__":
    # file = "sqlite:////home/tilo/tilo-tub/code/DrQA/data/wikipedia/docs.db"
    file = "sqlite:////home/tilo/code/DrQA/data/wikipedia/docs.db"
    base, engine = get_sqlalchemy_base_engine(file)
    tables = get_tables_by_reflection(base.metadata, engine)
    docs_table = tables["documents"]

    with engine.connect() as conn:
        num_rows = count_rows(engine, docs_table)
        num_batches = 8
        batch_size = num_rows // num_batches + 1
        it = iter(tqdm(get_rows(conn, select([docs_table]))))

        def row_gen():
            for k in range(batch_size):
                try:
                    d = next(it)
                except StopIteration:  # there is no next element in the iterator
                    break
                yield d

        for batch_idx in range(num_batches):
            data_io.write_jsonl(f"drqa_wikipedia_{batch_idx}.jsonl.gz",
                                row_gen(),
                                mode="ab")