def docmodel_tolist_speed_test(dm_path=DOCMODELS_PATH):
    print('Testing docmodel tolist speed')
    timer = Timer()
    r = [dm.metadata_to_dict() for dm in DocModel.docmodel_generator(dm_path)]
    print(f'Dm tolist speed test done. Made list with ids from {len(r)} docmodels in {timer.get_run_time()}')
def make_metadata_corpusframe(docmodels_path=DOCMODELS_PATH):
    """Builds a one-row-per-document metadata DataFrame, indexed by doc id."""
    return pd.DataFrame.from_records(
        [{'id': dm.get_id(),
          'title': dm.get_title(),
          'year': dm.get_year(),
          'source': dm.get_source(),
          'issn': dm.get_issn(),
          'doctype': dm.get_doctype(),
          'doctype_cat': dm.get_doctype_cat(),
          'primary_subjects': dm.get_primary_subjects(),
          'secondary_subjects': dm.get_secondary_subjects(),
          'abs_tokens': len(flatten_paras(filter_tags_basic(dm.get_abs_tags()))),
          'text_tokens': len(flatten_paras(filter_tags_basic(dm.get_text_tags())))}
         for dm in DocModel.docmodel_generator(docmodels_path)],
        index='id')
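# Usage sketch: the corpusframe builders below preallocate their columns from
# a pickled 'metadata_corpusframe.p' in CORPUSFRAMES_PATH, so a plausible
# first step is to build and cache the metadata frame once. This demo is an
# assumption about the intended workflow, not code from the original pipeline.
def _demo_cache_metadata_corpusframe():
    df = make_metadata_corpusframe()
    with open(CORPUSFRAMES_PATH / 'metadata_corpusframe.p', 'wb') as f:
        pickle.dump(df, f)
    print(df['year'].value_counts().sort_index())  # quick sanity check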
def list_paragraphs(docmodels_path=DOCMODELS_PATH):
    """Returns a list of all paragraph names in the corpus, format: docid_paranum"""
    return [f'{dm.get_id()}_para{i}'
            for dm in DocModel.docmodel_generator(docmodels_path, vocal=True)
            for i in range(len(dm.get_text_tags()))]
def docmodel_read_speed_test(dm_path=DOCMODELS_PATH, max_docs=1000000):
    print('Testing docmodel read speed')
    timer = Timer()
    for i, dm in enumerate(DocModel.docmodel_generator(dm_path)):
        dm.to_dict()  # force a full read; the result is discarded
        if (i + 1) == max_docs:
            break
    print(f'Dm read speed test done. Read {max_docs} docmodels in {timer.get_run_time()}')
def words_counts_old(docmodels_path=DOCMODELS_PATH, save_to='nva_counts_series.p'):
    c = Counter()
    for dm in DocModel.docmodel_generator(docmodels_path, vocal=True):
        c.update(flatten_paras(filter_tags_nva(dm.get_abs_tags())))
    c = pd.Series(c)
    with open(f'data\\{save_to}', 'wb') as f:
        pickle.dump(c, f)
    print(c)
    print(f'len: {len(c)}')
    print((c >= 5).value_counts())
    print((c >= 10).value_counts())
    print((c >= 20).value_counts())
def corpusframe_test(dm_path=DOCMODELS_PATH, max_docs=1000):
    print('Testing corpusframe from dicts')
    timer = Timer()
    # note: the comprehension still iterates the full generator; only the
    # first max_docs dicts are kept
    dl = [dm.to_dict()
          for i, dm in enumerate(DocModel.docmodel_generator(dm_path))
          if i < max_docs]
    df = pd.DataFrame(dl).set_index('id')
    print(f'Done making df from dicts, size: {df.memory_usage(index=True).sum() / (1024 ** 2)} mbs')
    timer.step()
    print(df)
def make_lexical_counts_paras_corpusframe(docmodels_path=DOCMODELS_PATH):
    lexicon_words = load_lexicon()
    # First pass: collect paragraph column names so the frame can be
    # preallocated with the right dimensions.
    cols = []
    for dm in DocModel.docmodel_generator(docmodels_path):
        for i, para in enumerate(dm.get_text_tags()):
            cols.append(f'{dm.get_id()}_para{i}')
    df = pd.DataFrame(np.float32(0.0), index=lexicon_words, columns=cols)
    # Second pass: count lexicon lemmas per paragraph.
    for dm in DocModel.docmodel_generator(docmodels_path):
        for i, para in enumerate(dm.get_text_tags()):
            cnt = Counter([tag.lemma for tag in para if tag.word in lexicon_words])
            df[f'{dm.get_id()}_para{i}'].update(pd.Series(cnt, dtype=np.float32))
    # (an earlier variant built the columns one at a time via df.index.map
    # instead of preallocating; kept out for speed)
    return df.transpose()
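# Usage sketch: the transposed frame has one row per paragraph, labelled the
# same way as list_paragraphs() output ('docid_paraN'), so rows can be
# filtered on lexicon hits. The nonzero threshold is an illustrative
# assumption, not part of the original pipeline.
def _demo_paragraphs_with_lexicon_hits():
    para_counts = make_lexical_counts_paras_corpusframe()
    return para_counts[para_counts.sum(axis=1) > 0]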
def extract_and_tag_docmodel_texts(path):
    """Loads and updates all DocModels in a dir by extracting and tagging abstracts and texts"""
    timer = Timer()
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='en')
    print(f'Starting to extract and tag texts from docmodels at {path}...')
    for i, dm in enumerate(DocModel.docmodel_generator(path)):
        dm.extract_abstract(TRASH_SECTIONS)
        dm.extract_text(TRASH_SECTIONS)
        dm.treetag_abstract(tagger)
        dm.treetag_text(tagger)
        dm.save_to_pickle()
        if (i + 1) % 10000 == 0:
            print(f'Processed {i + 1} docmodels...')
    print(f'Done! Processing time: {timer.get_run_time()}')
def make_abs_nva_docterm_corpusframe(docmodels_path=DOCMODELS_PATH):
    with open('data\\nva_counts_series.p', 'rb') as f:
        accepted_lemmas = pickle.load(f)
    accepted_lemmas = list(accepted_lemmas[accepted_lemmas >= 10].index)
    # Preallocate with doc ids from the metadata corpusframe; faster than
    # growing the frame as we go.
    with open(CORPUSFRAMES_PATH / 'metadata_corpusframe.p', 'rb') as f:
        df = pd.DataFrame(np.float32(0.0),
                          index=accepted_lemmas,
                          columns=pickle.load(f).index)
    for dm in DocModel.docmodel_generator(docmodels_path):
        cnt = Counter(filter_lemmas(flatten_paras(filter_tags_nva(dm.get_abs_tags())),
                                    accepted_lemmas))
        df[dm.get_id()].update(pd.Series(cnt, dtype=np.float32))
    return df.transpose()
def create_docmodels_from_xml_corpus(srs_path, save_path, extract_metadata=True):
    """Reads XMLs and creates DocModel objects. Extracts metadata if asked to, which should usually be the case."""
    timer = Timer()
    print(f'Starting to parse xml files at {srs_path}...')
    for i, filename in enumerate(os.listdir(srs_path)):
        try:
            DocModel(filename, ET.parse(srs_path / filename), save_path,
                     extract_metadata_on_init=extract_metadata)
        except Exception:
            print(f'Error on {filename}')
        if (i + 1) % 10000 == 0:
            print(f'Parsed {i + 1} files...')
    print(f'Done! Parsing time: {timer.get_run_time()}')
    print(f'Save path: {save_path}')
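# Pipeline sketch, assuming the intended ordering of the steps in this module:
# parse raw XMLs into pickled DocModels, extract and TreeTag the texts, then
# map doctype categories and subjects from the CSVs. The directory arguments
# are placeholders.
def _demo_build_corpus(xml_dir, docmodels_dir):
    create_docmodels_from_xml_corpus(xml_dir, docmodels_dir)
    extract_and_tag_docmodel_texts(docmodels_dir)
    generate_metadata_from_mappings(docmodels_dir)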
def word_counts(workon='texts', docmodels_path=DOCMODELS_PATH, min_token_len=1,
                id_list=None, tag_list=None):
    """Counts word occurrences in texts or abstracts.

    id_list: list of article ids; ids not in the list are skipped. If no list
        is provided, runs on the whole corpus.
    tag_list: only keeps tokens whose TreeTagger tag is in the list. If None,
        keeps all tags.
    Returns a df with words as index and 2 columns:
        total_occs: total occurrences of each word in the parsed texts
        article_counts: number of docs where each word is found at least once
    """
    options = {'texts': DocModel.get_text_tags, 'abstracts': DocModel.get_abs_tags}
    assert workon in options, f'Error in word_counts(): "workon" param must be in {options.keys()}'
    fct = options[workon]
    total_occs = Counter()
    article_counts = Counter()
    for dm in DocModel.docmodel_generator(docmodels_path, vocal=True):
        if (id_list is not None) and (dm.get_id() not in id_list):
            continue
        words = [tag.lemma for tag in flatten_paras(fct(dm))
                 if len(tag.lemma) >= min_token_len
                 and (tag_list is None or tag.pos in tag_list)]
        total_occs.update(words)
        article_counts.update(set(words))
    return pd.DataFrame({'total_occs': pd.Series(total_occs),
                         'article_counts': pd.Series(article_counts)})
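# Usage sketch: build a vocabulary of noun/verb/adjective lemmas that occur in
# at least 10 articles. The TreeTagger tag names ('NN', 'NNS', 'VV', 'JJ') and
# the threshold are illustrative assumptions; filter_tags_nva() presumably
# encodes the project's actual tag list.
def _demo_build_vocabulary():
    counts = word_counts(workon='abstracts', min_token_len=3,
                         tag_list=['NN', 'NNS', 'VV', 'JJ'])
    return counts[counts['article_counts'] >= 10].index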
def make_lexical_counts_corpusframe(docmodels_path=DOCMODELS_PATH):
    tag_fct = DocModel.get_abs_tags
    lexicon_words = load_lexicon()
    # Preallocate the frame with doc ids loaded from the metadata
    # corpusframe; faster than building it as we go.
    with open(CORPUSFRAMES_PATH / 'metadata_corpusframe.p', 'rb') as meta_file:
        df = pd.DataFrame(np.float32(0.0),
                          index=lexicon_words,
                          columns=pickle.load(meta_file).index)
    for dm in DocModel.docmodel_generator(docmodels_path):
        cnt = Counter(flatten_paras(
            [[tag.lemma for tag in para if tag.word in lexicon_words]
             for para in tag_fct(dm)]))
        df[dm.get_id()].update(pd.Series(cnt, dtype=np.float32))
    return df.transpose()
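# Usage sketch: normalize raw lexicon counts by abstract length using the
# 'abs_tokens' column of the metadata corpusframe built above. Whether the
# pipeline actually normalizes this way is an assumption.
def _demo_relative_lexical_freqs():
    with open(CORPUSFRAMES_PATH / 'metadata_corpusframe.p', 'rb') as f:
        meta = pickle.load(f)
    counts = make_lexical_counts_corpusframe()
    return counts.div(meta['abs_tokens'], axis=0)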
def make_coocs_df_3(lexicon, window=5, id_list=None, tag_list=None):
    """Counts lemma co-occurrences within +/- `window` tokens, paragraph by paragraph."""
    d = defaultdict(Counter)
    for dm in DocModel.docmodel_generator(DOCMODELS_PATH):
        if (id_list is not None) and (dm.get_id() not in id_list):
            continue
        for para in dm.get_text_tags():
            if tag_list is not None:
                para = [tag for tag in para if tag.pos in tag_list]
            for i, tag in enumerate(para):
                beg = max(i - window, 0)
                end = i + window + 1
                # note: the slice includes the center token itself
                d[tag.lemma].update(t.lemma for t in para[beg:end])
    d = dict(d)
    print(len(d.keys()))
    print(len(lexicon))
    # rows and columns are restricted to the lexicon; unseen pairs come out NaN
    return pd.DataFrame(d, index=lexicon, columns=lexicon)
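# Usage sketch: co-occurrence counts over the lexicon, row-normalized into
# co-occurrence profiles. The window size and tag list are illustrative
# assumptions.
def _demo_cooc_profiles():
    lexicon = load_lexicon()
    coocs = make_coocs_df_3(lexicon, window=5, tag_list=['NN', 'VV', 'JJ'])
    coocs = coocs.fillna(0.0)  # lexicon words never observed come out NaN
    return coocs.div(coocs.sum(axis=1), axis=0)  # each row sums to 1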
def generate_metadata_from_mappings(docmodels_path, generate_doctype_cats=True,
                                    generate_primary_subjects=True,
                                    generate_secondary_subjects=True):
    """Generates 'subjects' and 'doctype cats' metadata from mappings loaded from CSV (expected CSV shapes sketched below)."""
    timer = Timer()
    if generate_doctype_cats:
        with open(DOCTYPE_CATS_CSV_PATH, newline='') as cd_csv:
            doctype_cats_mapping = {n[0]: n[1] for n in csv.reader(cd_csv)}
        print(doctype_cats_mapping)
    if generate_primary_subjects:
        with open(PRIMARY_SUBJECTS_CSV_PATH, newline='') as cd_csv:
            primary_subjects_mapping = {
                n[0]: [n[i] for i in range(1, len(n)) if n[i] != '']
                for n in csv.reader(cd_csv)}
        print(primary_subjects_mapping)
    if generate_secondary_subjects:
        with open(SECONDARY_SUBJECTS_CSV_PATH, newline='') as cd_csv:
            secondary_subjects_mapping = {
                n[0]: [n[i] for i in range(1, len(n)) if n[i] != '']
                for n in csv.reader(cd_csv)}
        print(secondary_subjects_mapping)
    for dm in DocModel.docmodel_generator(docmodels_path):
        if generate_doctype_cats:
            dm.extract_doctype_cat(doctype_cats_mapping)
        if generate_primary_subjects:
            dm.extract_primary_subjects(primary_subjects_mapping)
        if generate_secondary_subjects:
            dm.extract_secondary_subjects(secondary_subjects_mapping)
        dm.save_to_pickle(docmodels_path / dm.filename)
    print(f'Done extracting metadata from csvs. Parsing time: {timer.get_run_time()}')
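# Expected CSV shapes, inferred from the readers above (not verified against
# the actual files):
#   DOCTYPE_CATS_CSV_PATH rows:   doctype,category
#   *_SUBJECTS_CSV_PATH rows:     key,subject1,subject2,...
# Empty trailing cells in the subject rows are skipped by the comprehensions.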