Example #1
def generate_html(event_texts, batch_size, n_process=-1, nlp=None, save=False):
    '''Pipe a list-like of strings through spaCy (multiprocess) and return rendered entity HTML as a list.

    arguments:
    -- event_texts : list of str (e.g. the event_text column from the events dataframe).
    -- batch_size : number of texts buffered per batch by each worker process.
    -- n_process : number of worker processes (default = -1 : use all available cores).
    -- nlp : pre-loaded spaCy language model. If None, load the default configuration for this task.
    
    returns:
        list of HTML strings
    '''

    if nlp is None:
        from pipeline.preprocessing.text import load_spacy_model
        nlp = load_spacy_model(output_type='doc',
                               trigger_matcher=True,
                               lemmatizer=False,
                               geological_matcher=True,
                               stopword_removal=False,
                               punctuation_removal=False,
                               lemmatize_triggers=True,
                               verbose=False)

    # pipe texts through the model and render each doc's entities as HTML

    html = [
        display_ent(doc, jupyter=False) for doc in tqdm(
            nlp.pipe(event_texts, batch_size=batch_size, n_process=n_process),
            desc='Rendering spaCy entities as HTML')
    ]

    return html
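A minimal usage sketch (not from the source): the texts and variable names below are placeholders, and it assumes tqdm, display_ent and load_spacy_model are importable as in the function above.

# Hypothetical usage of generate_html with two placeholder event texts.
sample_texts = [
    'Drilling intersected quartz veining at 120m depth.',
    'No significant assays were returned from the RAB program.',
]
html_strings = generate_html(sample_texts, batch_size=50, n_process=1)
print(html_strings[0][:200])  # preview the first rendered HTML snippet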
Example #2
def load_metadata_with_triggers(nlp=None, num_files=None, data_folder='data'):
    nlp = nlp or load_spacy_model(output_type='text', tokenizer_only=True)
    metadata = load_metadata()
    filenames = metadata.filename.tolist()
    if num_files is not None:
        filenames = filenames[:num_files]
    file_triggers = match_triggers(filenames, nlp=nlp, save_json=False)
    data = file_triggers.merge(metadata, on='filename')

    return data
Example #3
def load_event_data(nlp=None, cols=None, confidence='high'):
    nlp = nlp or load_spacy_model(output_type='text',
                                  lemmatizer=True,
                                  geological_matcher=False,
                                  stopword_removal=False,
                                  punctuation_removal=True,
                                  verbose=False)

    cols = default_event_cols if cols is None else cols
    df = pd.read_csv(f'data/events/events_{confidence}-conf.csv',
                     index_col=0,
                     usecols=cols)

    df['tokens'] = list(nlp.pipe(df.event_text.values))

    return df
Example #4
def extract_text_chunks(filenames,
                        pad=2,
                        skip_on_trigger=False,
                        tokenize=False,
                        nlp=None,
                        n_process=-1,
                        batch_size=100):
    # if a dict of {filename: pre-loaded file} is passed, skip loading from disk
    if isinstance(filenames, dict):
        files = filenames
    else:
        # load report files from disk to extract events on triggers
        files = load_files(filenames,
                           data_path='data/wamex_xml',
                           output='dict')

    if skip_on_trigger:
        # skipping overlapping text chunks is not implemented yet
        pass

    if tokenize:
        nlp = nlp or load_spacy_model(output_type='text',
                                      lemmatizer=True,
                                      geological_matcher=False,
                                      stopword_removal=False,
                                      punctuation_removal=True,
                                      lemmatize_triggers=True,
                                      verbose=False)

        files = {
            file: list(
                nlp.pipe(sentences, n_process=n_process,
                         batch_size=batch_size))
            for file, sentences in tqdm(files.items(),
                                        desc='Tokenizing file text')
        }

    return {
        file: [
            ' '.join((pad * [''] + sentences +
                      pad * [''])[idx:(1 + idx + (2 * pad))]).strip()
            for idx in range(len(sentences))
        ]
        for file, sentences in tqdm(files.items(),
                                    desc='Extracting text chunks')
    }
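The closing comprehension above builds, for each sentence, a chunk containing its pad neighbours on either side by slicing a padded copy of the sentence list. A standalone sketch of just that windowing step, with made-up sentences and pad=2:

# Illustration only: how the padded slice yields one text chunk per sentence.
pad = 2
sentences = ['s0', 's1', 's2', 's3']
padded = pad * [''] + sentences + pad * ['']
chunks = [
    ' '.join(padded[idx:(1 + idx + 2 * pad)]).strip()
    for idx in range(len(sentences))
]
# chunks == ['s0 s1 s2', 's0 s1 s2 s3', 's0 s1 s2 s3', 's1 s2 s3']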
Example #5
def write_html(df, batch_size=100, n_process=-1, nlp=None):
    '''Pipe an events dataframe through spaCy (multiprocess) and save pre-rendered .html files to data/html/spacy/.'''

    import pandas as pd

    # accept either a filename or an already-loaded dataframe
    if isinstance(df, str):  # if a filename is passed, load the file
        df = pd.read_csv(df, index_col=0)
    assert isinstance(df, pd.DataFrame)

    if nlp is None:
        # load language model
        from pipeline.preprocessing.text import load_spacy_model
        nlp = load_spacy_model(output_type='doc',
                               trigger_matcher=True,
                               lemmatizer=False,
                               geological_matcher=True,
                               stopword_removal=False,
                               punctuation_removal=False,
                               lemmatize_triggers=True,
                               verbose=False)

    event_html = {
        event_id: html
        for event_id, html in zip(
            df.event_id.values,
            generate_html(df.event_text.values,
                          batch_size=batch_size,
                          n_process=n_process,
                          nlp=nlp))
    }

    for event_id, html in tqdm(
            event_html.items(),
            desc='Saving pre-rendered .html to data/html/spacy/'):
        with open(os.path.join('data', 'html', 'spacy', f'{event_id}.html'),
                  'w+') as f:
            f.write(html)
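A hedged usage sketch (not from the source): the CSV path below follows the pattern used in load_event_data above and is illustrative; the file is assumed to contain event_id and event_text columns.

# Hypothetical call: render and save HTML for every event in a labelled events CSV.
write_html('data/events/events_high-conf.csv', batch_size=50, n_process=2)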
Example #6
def build_event_data(datasets: dict,
                     confidence,
                     pad=0,
                     batch_size=100,
                     n_process=6,
                     nlp=None,
                     labelled_ranges=True,
                     group_all_labelled=False,
                     named_entities=None,
                     return_entities=True,
                     geoview=None,
                     capstone_files=None,
                     files=None):

    # load files if files are not provided
    if not isinstance(capstone_files, pd.DataFrame) or files is None:
        capstone_files, files = get_report_data(count_sentences=False,
                                                return_files=True)

    # merge datasets provided by individual labellers
    df = merge_datasets(datasets, confidence=confidence)

    # build each event's text chunk from its labelled sentences with build_event_text
    df.insert(
        6, 'event_text',
        df.apply(lambda row: build_event_text(
            row, pad=2, labelled_ranges=labelled_ranges, files=files),
                 axis=1))

    # insert the event_id natural key which is f'{filename}_{idx}'
    df.insert(
        0, 'event_id',
        df.apply(lambda row: '_'.join(
            [row.filename.rsplit('.', 1)[0],
             str(row.sentence_idx)]),
                 axis=1))

    # compute the text chunk start and end sentence indices (from labelled ranges, or from pad)
    if labelled_ranges:
        df['lower_idx'] = df['sentence_idx'] + df['lower_bound']
        df['upper_idx'] = df['sentence_idx'] + df['upper_bound']
    else:
        df['lower_idx'] = df['sentence_idx'] - pad
        df['upper_idx'] = df['sentence_idx'] + pad

    df.rename(columns={'triggers': 'sentence_triggers'}, inplace=True)
    df.drop(columns=['lower_bound', 'upper_bound'], inplace=True)

    # load old event labels from the group labelling done early in the semester
    # (only included when confidence is below 'high', i.e. medium)
    if group_all_labelled and confidence.lower() != 'high':
        old_events = load_group_all_labelled(geoview=geoview,
                                             capstone_files=capstone_files,
                                             files=files)
        final_index = df.index[-1]
        old_events.index = np.arange(final_index,
                                     len(old_events) + final_index)
        df = df.append(old_events)

    # run named entity recognition with spacy on text chunk
    if return_entities:
        nlp = nlp or load_spacy_model(output_type='doc',
                                      trigger_matcher=True,
                                      lemmatizer=False,
                                      geological_matcher=True,
                                      stopword_removal=False,
                                      punctuation_removal=False,
                                      lemmatize_triggers=True)

        named_entities = named_entities or [
            'DATE', 'LOCATION', 'TRIGGER', 'STRAT', 'ROCK',
            'MINERAL', 'ORE_DEPOSIT', 'TIMESCALE'
        ]

        # create a list of (event_id, entity text, entity label) tuples for every entity in each event
        event_entities = [
            (event_id, ent.text, ent.label_)
            for event_id, doc in tqdm(
                zip(df.event_id.values,
                    nlp.pipe(df.event_text.values,
                             batch_size=batch_size,
                             n_process=n_process)),
                desc=f'Extracting {confidence} confidence events')
            for ent in doc.ents if ent.label_ in named_entities
        ]

        # join entity labels together as a string and then merge onto original dataframe
        df = df.merge(pd.DataFrame(
            data=event_entities,
            columns=['event_id', 'entity', 'label']).groupby([
                'event_id', 'label'
            ]).apply(lambda x: ', '.join(x.entity)).unstack(level='label'),
                      on='event_id',
                      how='left').fillna('')

    assert all(
        files[event.filename][event.sentence_idx] in event.event_text
        for event in df.itertuples()
    ), f'sentences not matched in {confidence} confidence event text'

    return df
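The final merge pivots the (event_id, entity, label) tuples into one comma-joined column per entity label before joining back onto the events dataframe. A standalone sketch of that pivot with toy data (the event ids and entities below are invented):

import pandas as pd

toy_entities = [
    ('rpt_1_10', 'gold', 'MINERAL'),
    ('rpt_1_10', 'quartz', 'MINERAL'),
    ('rpt_1_10', '1998', 'DATE'),
    ('rpt_2_04', 'Kalgoorlie', 'LOCATION'),
]
wide = (pd.DataFrame(toy_entities, columns=['event_id', 'entity', 'label'])
        .groupby(['event_id', 'label'])
        .apply(lambda x: ', '.join(x.entity))
        .unstack(level='label'))
# wide has one row per event_id and one comma-joined column per label,
# e.g. wide.loc['rpt_1_10', 'MINERAL'] == 'gold, quartz'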
Example #7
    group_all_labelled = args.events

    print(
        f'Building training data from labelled near miss instances by: {" ".join(users)}'
    )
    print(
        f'Confidence thresholds to label text chunks as near miss: {" ".join(confs)}'
    )
    if group_all_labelled:
        print('Including old labelled training data.')

    # load spacy model
    nlp = load_spacy_model(output_type='doc',
                           trigger_matcher=True,
                           lemmatizer=False,
                           geological_matcher=True,
                           stopword_removal=False,
                           punctuation_removal=False,
                           lemmatize_triggers=True,
                           verbose=False)

    # load files and geoview metadata
    capstone_files, files = get_report_data(count_sentences=True,
                                            return_files=True)

    metadata = pd.read_csv('data/geoview/capstone_metadata.zip',
                           compression='zip',
                           parse_dates=['report_year'],
                           usecols=[
                               'anumber', 'title', 'report_type', 'project',
                               'keywords', 'commodity', 'report_year'
                           ])