Example 1
def docmodel_tolist_speed_test(dm_path=DOCMODELS_PATH):
    print('Testing docmodel tolist speed')
    timer = Timer()
    r = [dm.metadata_to_dict() for dm in DocModel.docmodel_generator(dm_path)]
    print(
        f'Dm tolist speed test done. Made list of metadata dicts from {len(r)} docmodels in {timer.get_run_time()}'
    )
Example 2
def make_metadata_corpusframe(docmodels_path=DOCMODELS_PATH):
    return pd.DataFrame.from_records(
        [{
            'id': dm.get_id(),
            'title': dm.get_title(),
            'year': dm.get_year(),
            'source': dm.get_source(),
            'issn': dm.get_issn(),
            'doctype': dm.get_doctype(),
            'doctype_cat': dm.get_doctype_cat(),
            'primary_subjects': dm.get_primary_subjects(),
            'secondary_subjects': dm.get_secondary_subjects(),
            'abs_tokens': len(flatten_paras(filter_tags_basic(dm.get_abs_tags()))),
            'text_tokens': len(flatten_paras(filter_tags_basic(dm.get_text_tags())))
        } for dm in DocModel.docmodel_generator(docmodels_path)],
        index='id')
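
A minimal usage sketch for this function. The cache file name and CORPUSFRAMES_PATH are not invented here: they are the ones Examples 9 and 12 read back.

import pickle

# Build the metadata frame once and cache it where Examples 9 and 12 expect it.
meta_df = make_metadata_corpusframe()
with open(CORPUSFRAMES_PATH / 'metadata_corpusframe.p', 'wb') as f:
    pickle.dump(meta_df, f)
print(meta_df[['year', 'doctype_cat']].head())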
Example 3
def list_paragraphs(docmodels_path=DOCMODELS_PATH):
    """Returns a list of all paragraph names in the corpus, format: docid_paranum"""

    return [
        f'{dm.get_id()}_para{i}'
        for dm in DocModel.docmodel_generator(docmodels_path, vocal=True)
        for i in range(len(dm.get_text_tags()))
    ]
Example 4
def docmodel_read_speed_test(dm_path=DOCMODELS_PATH, max_docs=1000000):
    print('Testing docmodel read speed')
    timer = Timer()
    for i, dm in enumerate(DocModel.docmodel_generator(dm_path)):
        dm.to_dict()  # force a full read; the result is discarded
        if (i + 1) == max_docs:
            break
    print(
        f'Dm read speed test done. Read {i + 1} docmodels in {timer.get_run_time()}'
    )
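
Timer is project-specific and never shown in these examples. A minimal stand-in consistent with how it is used (get_run_time() in Examples 1, 4, 8, 10 and 14; step() in Example 6) might look like this; it is an inference from usage, not the original class.

import time

class Timer:
    """Minimal stand-in for the project's Timer, inferred from usage; an assumption."""

    def __init__(self):
        self.start = time.time()

    def get_run_time(self):
        # Elapsed time since construction, formatted as H:MM:SS
        return time.strftime('%H:%M:%S', time.gmtime(time.time() - self.start))

    def step(self):
        # Report elapsed time for the current step, then restart the clock
        print(f'Step done in {self.get_run_time()}')
        self.start = time.time()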
Example 5
def words_counts_old(docmodels_path=DOCMODELS_PATH,
                     save_to='nva_counts_series.p'):
    # Count noun/verb/adjective lemmas across all abstracts
    c = Counter()
    for dm in DocModel.docmodel_generator(docmodels_path, vocal=True):
        c.update(flatten_paras(filter_tags_nva(dm.get_abs_tags())))
    c = pd.Series(c)
    with open(f'data\\{save_to}', 'wb') as f:
        pickle.dump(c, f)
    print(c)
    print(f'len: {len(c)}')
    # How many lemmas clear each frequency cutoff (Example 9 uses >= 10)
    print((c >= 5).value_counts())
    print((c >= 10).value_counts())
    print((c >= 20).value_counts())
Example 6
from itertools import islice


def corpusframe_test(dm_path=DOCMODELS_PATH, max_docs=1000):
    print('Testing corpusframe from dicts')
    timer = Timer()
    # islice stops the generator after max_docs instead of exhausting it
    dl = [
        dm.to_dict()
        for dm in islice(DocModel.docmodel_generator(dm_path), max_docs)
    ]
    df = pd.DataFrame(dl).set_index('id')
    print(
        f'Done making df from dicts, size: {df.memory_usage(index=True).sum() / (1024 ** 2):.1f} MB'
    )
    timer.step()
    print(df)
Example 7
def make_lexical_counts_paras_corpusframe(docmodels_path=DOCMODELS_PATH):
    lexicon_words = load_lexicon()
    # First pass: collect one column name per paragraph ('<docid>_para<n>')
    cols = []
    for dm in DocModel.docmodel_generator(docmodels_path):
        for i in range(len(dm.get_text_tags())):
            cols.append(f'{dm.get_id()}_para{i}')

    # Pre-allocating the full frame is faster than growing it column by column
    df = pd.DataFrame(np.float32(0.0), index=lexicon_words, columns=cols)
    for dm in DocModel.docmodel_generator(docmodels_path):
        for i, para in enumerate(dm.get_text_tags()):
            cnt = Counter(
                [tag.lemma for tag in para if tag.word in lexicon_words])
            df[f'{dm.get_id()}_para{i}'].update(
                pd.Series(cnt, dtype=np.float32))
    # An earlier variant built one column at a time via df.index.map(Counter(...)),
    # which was slower than updating the pre-allocated frame.
    return df.transpose()
Example 8
def extract_and_tag_docmodel_texts(path):
    """Loads and updates all DocModels in a dir by extracting and tagging abstracts and texts"""

    timer = Timer()
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='en')
    print(f'Starting to extract and tag texts from docmodels at {path}...')
    for i, dm in enumerate(DocModel.docmodel_generator(path)):
        dm.extract_abstract(TRASH_SECTIONS)
        dm.extract_text(TRASH_SECTIONS)
        dm.treetag_abstract(tagger)
        dm.treetag_text(tagger)
        dm.save_to_pickle()
        if (i + 1) % 10000 == 0: print(f'Processed {i+1} docmodels...')
    print(f'Done! Processing time: {timer.get_run_time()}')
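
Taken together with Examples 10 and 14, the intended pipeline order appears to be: parse the XMLs into DocModels, extract and tag the texts, then derive the mapped metadata. A sketch of that driver, with XML_CORPUS_PATH as a hypothetical source directory:

# Hypothetical end-to-end driver; XML_CORPUS_PATH is an assumed constant.
# The three functions are the ones defined in Examples 10, 8 and 14.
create_docmodels_from_xml_corpus(XML_CORPUS_PATH, DOCMODELS_PATH)
extract_and_tag_docmodel_texts(DOCMODELS_PATH)
generate_metadata_from_mappings(DOCMODELS_PATH)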
Example 9
def make_abs_nva_docterm_corpusframe(docmodels_path=DOCMODELS_PATH):
    # Keep only lemmas seen at least 10 times (counts built by words_counts_old, Example 5)
    with open('data\\nva_counts_series.p', 'rb') as f:
        accepted_lemmas = pickle.load(f)
    accepted_lemmas = list(accepted_lemmas[accepted_lemmas >= 10].index)
    # Pre-size the doc-term frame with doc ids from the metadata corpusframe
    with open(CORPUSFRAMES_PATH / 'metadata_corpusframe.p', 'rb') as f:
        df = pd.DataFrame(np.float32(0.0),
                          index=accepted_lemmas,
                          columns=pickle.load(f).index)
    for dm in DocModel.docmodel_generator(docmodels_path):
        cnt = Counter(
            filter_lemmas(flatten_paras(filter_tags_nva(dm.get_abs_tags())),
                          accepted_lemmas))
        df[dm.get_id()].update(pd.Series(cnt, dtype=np.float32))
    return df.transpose()
Example 10
def create_docmodels_from_xml_corpus(srs_path,
                                     save_path,
                                     extract_metadata=True):
    """Reads XMLs and create DocModel objects. Extracts metadata if asked to, which should usually be the case."""

    timer = Timer()
    print(f'Starting to parse xml files at {srs_path}...')
    for i, filename in enumerate(os.listdir(srs_path)):
        try:
            DocModel(filename,
                     ET.parse(srs_path / filename),
                     save_path,
                     extract_metadata_on_init=extract_metadata)
        except:
            print(f'Error on {filename}')
        if (i + 1) % 10000 == 0: print(f'Parsed {i+1} files...')
    print(f'Done! Parsing time: {timer.get_run_time()}')
    print("Save path : {}".format(save_path))
Example 11
def word_counts(workon='texts',
                docmodels_path=DOCMODELS_PATH,
                min_token_len=1,
                id_list=None,
                tag_list=None):
    """Counts word occurrences in texts or abstracts.

    id_list: list of article ids, will skip ids not in the list. If no list is provided, will run on while corpus.
    tag_list: will only keep tokens with a TT tag in the list. If None, will keep all tags.

    Returns a df with 2 columns, words as index:
        total_occs: total occurences of each word in the parsed texts
        article_counts: number of docs where each words is found at least once
    """

    options = {
        'texts': DocModel.get_text_tags,
        'abstracts': DocModel.get_abs_tags
    }
    assert workon in options, f'Error in word_counts(): "workon" param must be one of {list(options)}'
    fct = options[workon]

    total_occs = Counter()
    article_counts = Counter()

    for dm in DocModel.docmodel_generator(docmodels_path, vocal=True):
        if (id_list is not None) and (dm.get_id() not in id_list):
            continue
        words = [
            tag.lemma for tag in flatten_paras(fct(dm))
            if len(tag.lemma) >= min_token_len and (
                tag_list is None or (tag.pos in tag_list))
        ]
        total_occs.update(words)
        article_counts.update(set(words))

    total_occs_s = pd.Series(total_occs)
    article_counts_s = pd.Series(article_counts)

    return pd.DataFrame({
        'total_occs': total_occs_s,
        'article_counts': article_counts_s
    })
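
A usage sketch for word_counts(). The TT tag values listed are an assumed subset of the TreeTagger English tagset, not taken from the source:

# Count noun and adjective lemmas of 3+ characters in abstracts, then keep
# words found in at least 10 different articles. The tag values are an
# assumption about the tagset in use.
counts = word_counts(workon='abstracts',
                     min_token_len=3,
                     tag_list=['NN', 'NNS', 'JJ'])
frequent = counts[counts['article_counts'] >= 10]
print(frequent.sort_values('total_occs', ascending=False).head(20))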
Example 12
def make_lexical_counts_corpusframe(docmodels_path=DOCMODELS_PATH):

    tag_fct = DocModel.get_abs_tags
    lexicon_words = load_lexicon()

    # Pre-size the df by loading doc ids from the metadata corpusframe;
    # this is faster than building the frame as we go
    with open(CORPUSFRAMES_PATH / 'metadata_corpusframe.p', 'rb') as f:
        df = pd.DataFrame(np.float32(0.0),
                          index=lexicon_words,
                          columns=pickle.load(f).index)

    for dm in DocModel.docmodel_generator(docmodels_path):
        cnt = Counter(
            flatten_paras(
                [[tag.lemma for tag in para if tag.word in lexicon_words]
                 for para in tag_fct(dm)]))
        df[dm.get_id()].update(pd.Series(cnt, dtype=np.float32))

    return df.transpose()
Example 13
def make_coocs_df_3(lexicon, window=5, id_list=None, tag_list=None):
    d = defaultdict(Counter)
    for dm in DocModel.docmodel_generator(DOCMODELS_PATH):
        if (id_list is not None) and (dm.get_id() not in id_list):
            continue
        for para in dm.get_text_tags():
            # Only filter by POS tag when a tag_list is given
            # (the original filtered unconditionally and crashed on the None default)
            if tag_list is not None:
                para = [tag for tag in para if tag.pos in tag_list]
            for i, tag in enumerate(para):
                # Count lemmas in a +/- window slots around the target
                # (the slice also includes the target token itself)
                beg = max(i - window, 0)
                end = i + window + 1
                d[tag.lemma].update(t.lemma for t in para[beg:end])

    d = dict(d)
    print(f'Lemmas with cooccurrence counts: {len(d)}')
    print(f'Lexicon size: {len(lexicon)}')
    return pd.DataFrame(d, index=lexicon, columns=lexicon)
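
A toy check of the windowing logic, with a hypothetical Tag namedtuple standing in for the TreeTagger tags (the word/pos/lemma fields mirror how tags are accessed above):

from collections import Counter, defaultdict, namedtuple

# Hypothetical stand-in for the tags produced by treetaggerwrapper
Tag = namedtuple('Tag', ['word', 'pos', 'lemma'])
para = [Tag(w, 'NN', w) for w in ['cell', 'membrane', 'protein', 'cell']]

window = 1
d = defaultdict(Counter)
for i, tag in enumerate(para):
    d[tag.lemma].update(t.lemma for t in para[max(i - window, 0):i + window + 1])
# 'membrane' co-occurs once with 'cell' and once with 'protein',
# plus once with itself, since the slice includes the target token.
print(d['membrane'])  # Counter({'cell': 1, 'membrane': 1, 'protein': 1})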
Example 14
def generate_metadata_from_mappings(docmodels_path,
                                    generate_doctype_cats=True,
                                    generate_primary_subjects=True,
                                    generate_secondary_subjects=True):
    """Generate 'subjects' and 'doctype cats' metadata from mappings loaded from CSV"""

    timer = Timer()

    if generate_doctype_cats:
        with open(DOCTYPE_CATS_CSV_PATH, newline='') as cd_csv:
            # Row format: doctype in the first column, its category in the second
            doctype_cats_mapping = {row[0]: row[1] for row in csv.reader(cd_csv)}
            print(doctype_cats_mapping)
    if generate_primary_subjects:
        with open(PRIMARY_SUBJECTS_CSV_PATH, newline='') as cd_csv:
            # Row format: key in the first column, subject values in the rest; blanks skipped
            primary_subjects_mapping = {
                row[0]: [v for v in row[1:] if v != '']
                for row in csv.reader(cd_csv)
            }
            print(primary_subjects_mapping)
    if generate_secondary_subjects:
        with open(SECONDARY_SUBJECTS_CSV_PATH, newline='') as cd_csv:
            secondary_subjects_mapping = {
                row[0]: [v for v in row[1:] if v != '']
                for row in csv.reader(cd_csv)
            }
            print(secondary_subjects_mapping)

    for dm in DocModel.docmodel_generator(docmodels_path):
        if generate_doctype_cats:
            dm.extract_doctype_cat(doctype_cats_mapping)
        if generate_primary_subjects:
            dm.extract_primary_subjects(primary_subjects_mapping)
        if generate_secondary_subjects:
            dm.extract_secondary_subjects(secondary_subjects_mapping)
        dm.save_to_pickle(docmodels_path / dm.filename)
    print(
        f'Done extracting metadata from csvs. Parsing time: {timer.get_run_time()}'
    )
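
For reference, the mapping CSVs are assumed to look roughly like this (hypothetical rows, not taken from the source; the key sits in the first column, mapped values in the remaining columns, blanks skipped):

doctype_cats.csv (hypothetical):
research-article,article
book-review,review

primary_subjects.csv (hypothetical):
research-article,biology,medicine
book-review,history,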